Merge upstream changes, fix conflicts

Commit 93cdea1d7b by 0cc4m, 2024-02-25 17:35:05 +01:00
132 changed files with 13710 additions and 4764 deletions

.devops/nix/docker.nix (new file)

@@ -0,0 +1,37 @@
{
lib,
dockerTools,
buildEnv,
llama-cpp,
interactive ? true,
coreutils,
}:
# A tar that can be fed into `docker load`:
#
# $ nix build .#llamaPackages.docker
# $ docker load < result
# For details and variations cf.
# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
# - https://nixery.dev/
# Approximate (compressed) sizes, at the time of writing, are:
#
# .#llamaPackages.docker: 125M;
# .#llamaPackagesCuda.docker: 537M;
# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
dockerTools.buildLayeredImage {
name = llama-cpp.pname;
tag = "latest";
contents =
[ llama-cpp ]
++ lib.optionals interactive [
coreutils
dockerTools.binSh
dockerTools.caCertificates
];
}
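
A usage sketch for the resulting image (the nix/docker-load commands come from the comments above; the run line is an assumption, since the image is tagged llama-cpp:latest per name/tag here but no entrypoint is configured):

    nix build .#llamaPackages.docker
    docker load < result
    docker run --rm -it llama-cpp:latest sh   # the interactive variant ships binSh and coreutils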

.devops/nix/package.nix

@@ -255,11 +255,11 @@ effectiveStdenv.mkDerivation (
     # Configurations we don't want even the CI to evaluate. Results in the
     # "unsupported platform" messages. This is mostly a no-op, because
     # cudaPackages would've refused to evaluate anyway.
-    badPlatforms = optionals (useCuda || useOpenCL || useVulkan) lib.platforms.darwin;
+    badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
     # Configurations that are known to result in build failures. Can be
     # overridden by importing Nixpkgs with `allowBroken = true`.
-    broken = (useMetalKit && !effectiveStdenv.isDarwin) || (useVulkan && effectiveStdenv.isDarwin);
+    broken = (useMetalKit && !effectiveStdenv.isDarwin);
     description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
     homepage = "https://github.com/ggerganov/llama.cpp/";

.devops/nix/scope.nix

@@ -12,5 +12,8 @@ lib.makeScope newScope (
   self: {
     inherit llamaVersion;
     llama-cpp = self.callPackage ./package.nix { };
+    docker = self.callPackage ./docker.nix { };
+    docker-min = self.callPackage ./docker.nix { interactive = false; };
+    sif = self.callPackage ./sif.nix { };
   }
 )
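
With these attributes in scope, each container flavour becomes a one-line build (a sketch; the attribute paths assume the flake exposes this scope as llamaPackages, as the docker.nix comments suggest):

    nix build .#llamaPackages.docker       # interactive image
    nix build .#llamaPackages.docker-min   # minimal image, no shell utilities
    nix build .#llamaPackages.sif          # Singularity/Apptainer image (sif.nix below)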

.devops/nix/sif.nix (new file)

@@ -0,0 +1,27 @@
{
lib,
singularity-tools,
llama-cpp,
bashInteractive,
interactive ? false,
}:
let
optionalInt = cond: x: if cond then x else 0;
in
singularity-tools.buildImage rec {
inherit (llama-cpp) name;
contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
# These are excessive (but safe) for most variants. Building singularity
# images requires superuser privileges, so we build them inside a VM in a
# writable image of pre-determined size.
#
# ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
#
# Expected image sizes:
# - cpu/blas: 150M,
# - cuda, all gencodes: 560M,
diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
memSize = diskSize;
}
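
A build-and-run sketch for the SIF image (the apptainer invocation is an assumption; only the nix attribute comes from scope.nix above):

    nix build .#llamaPackages.sif
    apptainer exec ./result <command>   # pass interactive = true to get bashInteractive inside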

.flake8

@@ -1,2 +1,3 @@
 [flake8]
 max-line-length = 125
+ignore = W503

.github/ISSUE_TEMPLATE/bug.md

@@ -7,3 +7,5 @@ assignees: ''
 ---
 
 Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
+
+If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).

.github/workflows/build.yml

@@ -37,6 +37,8 @@ jobs:
       - name: Build
         id: make_build
+        env:
+          LLAMA_FATAL_WARNINGS: 1
         run: |
           CC=gcc-8 make -j $(nproc)
@@ -65,7 +67,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake ..
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON
           cmake --build . --config Release -j $(nproc)
 
       - name: Test
@@ -100,7 +102,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
           cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
 
       - name: Test
@@ -244,6 +246,8 @@ jobs:
       - name: Build
         id: make_build
+        env:
+          LLAMA_FATAL_WARNINGS: 1
         run: |
           LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
@@ -277,7 +281,7 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_METAL=OFF ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
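
The warnings-as-errors builds can be reproduced locally with the same switches the workflow uses:

    LLAMA_FATAL_WARNINGS=1 make -j $(nproc)
    # or with CMake:
    cmake .. -DLLAMA_FATAL_WARNINGS=ON && cmake --build . --config Release -j $(nproc)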

.github/workflows/nix-ci-aarch64.yml

@@ -19,7 +19,6 @@ on:
 jobs:
   nix-build-aarch64:
-    if: ${{ vars.CACHIX_NAME != '' }}
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
@@ -37,8 +36,8 @@ jobs:
           extra-conf: |
             extra-platforms = aarch64-linux
             extra-system-features = nixos-test kvm
-            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
       - uses: DeterminateSystems/magic-nix-cache-action@v2
         with:
           upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -46,7 +45,7 @@ jobs:
         uses: cachix/cachix-action@v13
         with:
           authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-          name: ${{ vars.CACHIX_NAME }}
+          name: llama-cpp
       - name: Show all output paths
         run: >
           nix run github:nix-community/nix-eval-jobs

.github/workflows/nix-ci.yml

@@ -23,8 +23,8 @@ jobs:
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           extra-conf: |
-            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
       - uses: DeterminateSystems/magic-nix-cache-action@v2
         with:
           upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -37,7 +37,6 @@ jobs:
           --flake
           ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
   nix-build:
-    if: ${{ vars.CACHIX_NAME != '' }}
     strategy:
       fail-fast: false
       matrix:
@@ -51,8 +50,8 @@ jobs:
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           extra-conf: |
-            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
       - uses: DeterminateSystems/magic-nix-cache-action@v2
         with:
           upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -60,7 +59,7 @@ jobs:
         uses: cachix/cachix-action@v13
         with:
           authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-          name: ${{ vars.CACHIX_NAME }}
+          name: llama-cpp
       - name: Build
         run: >
           nix run github:Mic92/nix-fast-build
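
Because the substituter and its public key are now hardcoded instead of coming from repository variables, local builds can reuse the CI cache too (values copied verbatim from the workflow above):

    nix build . \
      --extra-substituters https://llama-cpp.cachix.org \
      --extra-trusted-public-keys llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc=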

.github/workflows/python-lint.yml

@@ -16,5 +16,5 @@ jobs:
       - name: flake8 Lint
         uses: py-actions/flake8@v2
         with:
-          ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
+          ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
           exclude: "examples/*,examples/*/**,*/**/__init__.py"

.github/workflows/server.yml (new file)

@@ -0,0 +1,127 @@
# Server build and tests
name: Server
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
- test/server-add-ci-test # FIXME remove
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
jobs:
server:
runs-on: ubuntu-latest
strategy:
matrix:
build: [noavx, avx2, avx, avx512, cublas, clblast, openblas, kompute, vulkan]
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
include:
- build: 'noavx'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
image: ubuntu:latest
- build: 'avx2'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
image: ubuntu:latest
- build: 'avx'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
image: ubuntu:latest
- build: 'avx512'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON'
image: ubuntu:latest
experimental: true
- build: 'cublas'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON'
image: nvidia/cuda:12.3.1-devel-ubuntu22.04
arch_not_available: true # require nvidia docker engine
- build: 'clblast'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON'
image: ubuntu:latest
arch_not_available: true
- build: 'openblas'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS'
image: ubuntu:latest
- build: 'kompute'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
image: ubuntu:latest
arch_not_available: true
- build: 'vulkan'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON'
image: ubuntu:latest
arch_not_available: true
container:
image: ${{ matrix.image }}
ports:
- 8888
options: --cpus 4
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
apt-get update
apt-get -y install \
build-essential \
pkg-config \
git \
cmake \
python3-pip \
wget \
psmisc
- name: Download CLBlast
id: get_clblast
if: ${{ matrix.build == 'clblast' }}
run: |
apt install -y libclblast-dev
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas' }}
run: |
apt-get -y install libopenblas-dev
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
run: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | tee /etc/apt/trusted.gpg.d/lunarg.asc
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
apt-get update
apt-get -y install vulkan-sdk
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.defines }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r examples/server/tests/requirements.txt
- name: Download models
id: download_models
run: |
cd examples/server/tests
../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf
- name: Tests
id: server_integration_test
continue-on-error: ${{ matrix.experimental || matrix.arch_not_available }}
run: |
cd examples/server/tests
PORT=8888 ./tests.sh
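
The suite can also be run outside CI; a local sketch condensed from the job steps above (paths as in the workflow):

    mkdir build && cd build
    cmake .. -DLLAMA_BUILD_SERVER=ON
    cmake --build . --config Release -j $(nproc) --target server
    cd ../examples/server/tests
    pip install -r requirements.txt
    PORT=8888 ./tests.sh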

.gitignore

@@ -23,11 +23,13 @@
 .clang-tidy
 .vs/
 .vscode/
+.idea/
 
 lcov-report/
 gcovr-report/
 
 build*
+cmake-build-*
 out/
 tmp/

CMakeLists.txt

@@ -55,6 +55,9 @@ option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings"
 option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
 option(LLAMA_GPROF "llama: enable gprof" OFF)
 
+# build
+option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)
+
 # sanitizers
 option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
 option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
@@ -107,22 +110,20 @@ option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF)
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
+option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF)
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
+option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF)
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ON)
 
 # add perf arguments
 option(LLAMA_PERF "llama: enable perf" OFF)
-if (LLAMA_PERF)
-    add_definitions(-DGGML_PERF)
-endif()
 
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
@@ -130,6 +131,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 #
 # Compile flags
 #
+
 if (LLAMA_SYCL)
     set(CMAKE_CXX_STANDARD 17)
 else()
@@ -140,6 +142,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
+
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)
@@ -151,17 +154,17 @@ endif()
 if (NOT MSVC)
     if (LLAMA_SANITIZE_THREAD)
         add_compile_options(-fsanitize=thread)
-        link_libraries(-fsanitize=thread)
+        link_libraries (-fsanitize=thread)
     endif()
 
     if (LLAMA_SANITIZE_ADDRESS)
         add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
-        link_libraries(-fsanitize=address)
+        link_libraries (-fsanitize=address)
     endif()
 
     if (LLAMA_SANITIZE_UNDEFINED)
         add_compile_options(-fsanitize=undefined)
-        link_libraries(-fsanitize=undefined)
+        link_libraries (-fsanitize=undefined)
     endif()
 endif()
@@ -199,6 +202,29 @@ if (LLAMA_METAL)
     # copy ggml-metal.metal to bin directory
     configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
 
+    if (LLAMA_METAL_EMBED_LIBRARY)
+        enable_language(ASM)
+        add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
+
+        set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
+        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
+        set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
+
+        add_custom_command(
+            OUTPUT ${EMBED_METALLIB_ASSEMBLY}
+            COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
+            COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
+            COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
+            COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
+            COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
+            COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
+            DEPENDS ${METALLIB_SOURCE}
+            COMMENT "Generate assembly for embedded Metal library"
+        )
+
+        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
+    endif()
+
     if (LLAMA_METAL_SHADER_DEBUG)
         # custom command to do the following:
         #   xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
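
LLAMA_METAL_EMBED_LIBRARY assembles the Metal shader source into a __DATA section of the binary, so the runtime no longer has to locate ggml-metal.metal on disk. A configure sketch (the Makefile hunk further down adds the equivalent LLAMA_METAL_EMBED_LIBRARY=1 switch):

    cmake .. -DLLAMA_METAL=ON -DLLAMA_METAL_EMBED_LIBRARY=ON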
@@ -298,14 +324,17 @@ if (LLAMA_BLAS)
         endif()
 
         message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
+
         add_compile_options(${BLAS_LINKER_FLAGS})
+
         add_compile_definitions(GGML_USE_OPENBLAS)
+
         if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
             add_compile_definitions(GGML_BLAS_USE_MKL)
         endif()
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
+
+        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${BLAS_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
     else()
         message(WARNING "BLAS not found, please refer to "
         "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@@ -330,9 +359,6 @@ if (LLAMA_CUBLAS)
         set(GGML_SOURCES_CUDA ggml-cuda.cu)
 
         add_compile_definitions(GGML_USE_CUBLAS)
-#        if (LLAMA_CUDA_CUBLAS)
-#            add_compile_definitions(GGML_CUDA_CUBLAS)
-#        endif()
         if (LLAMA_CUDA_FORCE_DMMV)
             add_compile_definitions(GGML_CUDA_FORCE_DMMV)
         endif()
@@ -387,15 +413,20 @@ if (LLAMA_MPI)
     find_package(MPI)
     if (MPI_C_FOUND)
         message(STATUS "MPI found")
+
         set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
+        set(GGML_SOURCES_MPI ggml-mpi.c)
+
         add_compile_definitions(GGML_USE_MPI)
         add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
+
         if (NOT MSVC)
             add_compile_options(-Wno-cast-qual)
         endif()
+
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
+
         # Even if you're only using the C header, C++ programs may bring in MPI
         # C++ functions, so more linkage is needed
         if (MPI_CXX_FOUND)
@@ -427,31 +458,28 @@ if (LLAMA_VULKAN)
     if (Vulkan_FOUND)
         message(STATUS "Vulkan found")
 
-        add_library(ggml-vulkan OBJECT ggml-vulkan.cpp ggml-vulkan.h)
-        if (BUILD_SHARED_LIBS)
-            set_target_properties(ggml-vulkan PROPERTIES POSITION_INDEPENDENT_CODE ON)
-        endif()
-        target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
+        set(GGML_HEADERS_VULKAN ggml-vulkan.h)
+        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
 
         add_compile_definitions(GGML_USE_VULKAN)
 
         if (LLAMA_VULKAN_CHECK_RESULTS)
-            target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_CHECK_RESULTS)
+            add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
         endif()
 
         if (LLAMA_VULKAN_DEBUG)
-            target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_DEBUG)
+            add_compile_definitions(GGML_VULKAN_DEBUG)
         endif()
 
         if (LLAMA_VULKAN_VALIDATE)
-            target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_VALIDATE)
+            add_compile_definitions(GGML_VULKAN_VALIDATE)
         endif()
 
         if (LLAMA_VULKAN_RUN_TESTS)
-            target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_RUN_TESTS)
+            add_compile_definitions(GGML_VULKAN_RUN_TESTS)
         endif()
 
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan)
     else()
         message(WARNING "Vulkan not found")
     endif()
@@ -463,43 +491,45 @@ if (LLAMA_HIPBLAS)
     if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
         message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
     endif()
 
     if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
         message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
     endif()
 
-    find_package(hip)
-    find_package(hipblas)
-    find_package(rocblas)
+    find_package(hip     REQUIRED)
+    find_package(hipblas REQUIRED)
+    find_package(rocblas REQUIRED)
 
-    if (${hipblas_FOUND} AND ${hip_FOUND})
-        message(STATUS "HIP and hipBLAS found")
-        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
-        if (LLAMA_HIP_UMA)
-            add_compile_definitions(GGML_HIP_UMA)
-        endif()
-        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
-        if (BUILD_SHARED_LIBS)
-            set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
-        endif()
-        if (LLAMA_CUDA_FORCE_DMMV)
-            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
-        endif()
-        if (LLAMA_CUDA_FORCE_MMQ)
-            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
-        endif()
-        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
-        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
-        target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
-        target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
+    message(STATUS "HIP and hipBLAS found")
 
-        if (LLAMA_STATIC)
-            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
-        endif()
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
-    else()
-        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
+    set(GGML_HEADERS_ROCM ggml-cuda.h)
+    set(GGML_SOURCES_ROCM ggml-cuda.cu)
+
+    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+
+    if (LLAMA_HIP_UMA)
+        add_compile_definitions(GGML_HIP_UMA)
     endif()
+
+    if (LLAMA_CUDA_FORCE_DMMV)
+        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
+    endif()
+
+    if (LLAMA_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
+
+    add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+
+    set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
+
+    if (LLAMA_STATIC)
+        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
+    endif()
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
 endif()
 
 if (LLAMA_SYCL)
@@ -509,10 +539,14 @@ if (LLAMA_SYCL)
     #todo: AOT
 
     find_package(IntelSYCL REQUIRED)
+
+    message(STATUS "SYCL found")
+
+    add_compile_definitions(GGML_USE_SYCL)
+
     if (LLAMA_SYCL_F16)
         add_compile_definitions(GGML_SYCL_F16)
     endif()
-    add_compile_definitions(GGML_USE_SYCL)
 
     add_compile_options(-I./) #include DPCT
     add_compile_options(-I/${SYCL_INCLUDE_DIR})
@@ -521,7 +555,7 @@ if (LLAMA_SYCL)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
 
-    set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
+    set(GGML_HEADERS_SYCL ggml-sycl.h)
     set(GGML_SOURCES_SYCL ggml-sycl.cpp)
 
     if (WIN32)
@@ -540,61 +574,61 @@ if (LLAMA_KOMPUTE)
     endif()
 
     function(compile_shader)
         set(options)
         set(oneValueArgs)
         set(multiValueArgs SOURCES)
         cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
         foreach(source ${compile_shader_SOURCES})
             get_filename_component(filename ${source} NAME)
             set(spv_file ${filename}.spv)
             add_custom_command(
                 OUTPUT ${spv_file}
                 DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
                     ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
                     ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
                     ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
                     ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
                 COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
                 COMMENT "Compiling ${source} to ${spv_file}"
             )
             get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
             set(FILE_NAME "shader${RAW_FILE_NAME}")
             string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
             string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
             string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
             set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
             message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
             if(CMAKE_GENERATOR MATCHES "Visual Studio")
                 add_custom_command(
                     OUTPUT ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                     DEPENDS ${spv_file} xxd
                     COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
                 )
             else()
                 add_custom_command(
                     OUTPUT ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                     DEPENDS ${spv_file} xxd
                     COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
                 )
             endif()
         endforeach()
     endfunction()
 
     if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
@@ -604,66 +638,66 @@ if (LLAMA_KOMPUTE)
         # Compile our shaders
         compile_shader(SOURCES
             kompute-shaders/op_scale.comp
             kompute-shaders/op_scale_8.comp
             kompute-shaders/op_add.comp
             kompute-shaders/op_addrow.comp
             kompute-shaders/op_mul.comp
             kompute-shaders/op_silu.comp
             kompute-shaders/op_relu.comp
             kompute-shaders/op_gelu.comp
             kompute-shaders/op_softmax.comp
             kompute-shaders/op_norm.comp
             kompute-shaders/op_rmsnorm.comp
             kompute-shaders/op_diagmask.comp
             kompute-shaders/op_mul_mat_mat_f32.comp
             kompute-shaders/op_mul_mat_f16.comp
             kompute-shaders/op_mul_mat_q8_0.comp
             kompute-shaders/op_mul_mat_q4_0.comp
             kompute-shaders/op_mul_mat_q4_1.comp
             kompute-shaders/op_mul_mat_q6_k.comp
             kompute-shaders/op_getrows_f16.comp
             kompute-shaders/op_getrows_q4_0.comp
             kompute-shaders/op_getrows_q4_1.comp
             kompute-shaders/op_getrows_q6_k.comp
             kompute-shaders/op_rope_f16.comp
             kompute-shaders/op_rope_f32.comp
             kompute-shaders/op_cpy_f16_f16.comp
             kompute-shaders/op_cpy_f16_f32.comp
             kompute-shaders/op_cpy_f32_f16.comp
             kompute-shaders/op_cpy_f32_f32.comp
         )
 
         # Create a custom target for our generated shaders
         add_custom_target(generated_shaders DEPENDS
             shaderop_scale.h
             shaderop_scale_8.h
             shaderop_add.h
             shaderop_addrow.h
             shaderop_mul.h
             shaderop_silu.h
             shaderop_relu.h
             shaderop_gelu.h
             shaderop_softmax.h
             shaderop_norm.h
             shaderop_rmsnorm.h
             shaderop_diagmask.h
             shaderop_mul_mat_mat_f32.h
             shaderop_mul_mat_f16.h
             shaderop_mul_mat_q8_0.h
             shaderop_mul_mat_q4_0.h
             shaderop_mul_mat_q4_1.h
             shaderop_mul_mat_q6_k.h
             shaderop_getrows_f16.h
             shaderop_getrows_q4_0.h
             shaderop_getrows_q4_1.h
             shaderop_getrows_q6_k.h
             shaderop_rope_f16.h
             shaderop_rope_f32.h
             shaderop_cpy_f16_f16.h
             shaderop_cpy_f16_f32.h
             shaderop_cpy_f32_f16.h
             shaderop_cpy_f32_f32.h
         )
 
         # Create a custom command that depends on the generated_shaders
@@ -676,8 +710,10 @@ if (LLAMA_KOMPUTE)
         # Add the stamp to the main sources to ensure dependency tracking
         set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
         set(GGML_HEADERS_KOMPUTE ggml-kompute.h   ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
+
         add_compile_definitions(GGML_USE_KOMPUTE)
+
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
     else()
@@ -685,6 +721,18 @@ if (LLAMA_KOMPUTE)
     endif()
 endif()
 
+if (LLAMA_CPU_HBM)
+    find_library(memkind memkind REQUIRED)
+
+    add_compile_definitions(GGML_USE_CPU_HBM)
+
+    target_link_libraries(ggml PUBLIC memkind)
+endif()
+
+if (LLAMA_PERF)
+    add_compile_definitions(GGML_PERF)
+endif()
+
 function(get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")
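
Both relocated blocks stay plain toggles; for example, the CPU high-bandwidth-memory path (which needs the memkind library, per the find_library call above) is enabled with:

    cmake .. -DLLAMA_CPU_HBM=ON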
@@ -709,28 +757,30 @@ function(get_flags CCID CCVER)
         if (CCVER VERSION_GREATER_EQUAL 8.1.0)
             list(APPEND CXX_FLAGS -Wextra-semi)
         endif()
-    elseif (CCID MATCHES "Intel")
-        if (NOT LLAMA_SYCL)
-            # enable max optimization level when using Intel compiler
-            set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
-            set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
-            add_link_options(-fuse-ld=lld -static-intel)
-        endif()
     endif()
 
     set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
     set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
 endfunction()
 
+if (LLAMA_FATAL_WARNINGS)
+    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+        list(APPEND C_FLAGS   -Werror)
+        list(APPEND CXX_FLAGS -Werror)
+    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+        add_compile_options(/WX)
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
-        set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        set(C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-                    -Werror=implicit-int -Werror=implicit-function-declaration)
-        set(CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
-        set(C_FLAGS ${WARNING_FLAGS} ${C_FLAGS})
-        set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS})
+        list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+        list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+                            -Werror=implicit-int -Werror=implicit-function-declaration)
+        list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
+
+        list(APPEND C_FLAGS   ${WARNING_FLAGS})
+        list(APPEND CXX_FLAGS ${WARNING_FLAGS})
 
         get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
@@ -746,9 +796,10 @@ endif()
 set(CUDA_CXX_FLAGS "")
 
 if (LLAMA_CUBLAS)
-    set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
-    if (NOT MSVC)
-        list(APPEND CUDA_FLAGS -Wno-pedantic)
+    set(CUDA_FLAGS -use_fast_math)
+
+    if (LLAMA_FATAL_WARNINGS)
+        list(APPEND CUDA_FLAGS -Werror all-warnings)
     endif()
 
     if (LLAMA_ALL_WARNINGS AND NOT MSVC)
@@ -782,7 +833,11 @@ if (LLAMA_CUBLAS)
         message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
 
         get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
+        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
+    endif()
+
+    if (NOT MSVC)
+        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
     endif()
 endif()
@@ -821,6 +876,7 @@ execute_process(
     ERROR_VARIABLE output
     OUTPUT_QUIET
 )
+
 if (output MATCHES "dyld-1015\.7")
     add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
 endif()
@@ -830,10 +886,10 @@ endif()
 # feel free to update the Makefile for your architecture and send a pull request or issue
 
 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
 
 if (MSVC)
     string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
     message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
 else ()
     set(CMAKE_GENERATOR_PLATFORM_LWR "")
 endif ()
 
 if (NOT MSVC)
@@ -855,11 +911,21 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
     CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")
     message(STATUS "ARM detected")
     if (MSVC)
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
         add_compile_definitions(__ARM_NEON)
         add_compile_definitions(__ARM_FEATURE_FMA)
-        add_compile_definitions(__ARM_FEATURE_DOTPROD)
-        # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
-        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+
+        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
+        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+        if (GGML_COMPILER_SUPPORT_DOTPROD)
+            add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        endif ()
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+        endif ()
+        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
     else()
         check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
         if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
@@ -870,10 +936,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
             list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
         endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
-            # Raspberry Pi 2
-            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+                # Android armeabi-v7a
+                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
+            else()
+                # Raspberry Pi 2
+                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            endif()
         endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Android arm64-v8a
             # Raspberry Pi 3, 4, Zero 2 (32-bit)
             list(APPEND ARCH_FLAGS -mno-unaligned-access)
         endif()
@@ -1017,11 +1089,6 @@ endif()
 
 # ggml
 
-if (GGML_USE_CPU_HBM)
-    add_definitions(-DGGML_USE_CPU_HBM)
-    find_library(memkind memkind REQUIRED)
-endif()
-
 add_library(ggml OBJECT
             ggml.c
             ggml.h
@@ -1038,16 +1105,17 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_EXTRA}   ${GGML_HEADERS_EXTRA}
             ${GGML_SOURCES_SYCL}    ${GGML_HEADERS_SYCL}
             ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
+            ${GGML_SOURCES_VULKAN}  ${GGML_HEADERS_VULKAN}
+            ${GGML_SOURCES_ROCM}    ${GGML_HEADERS_ROCM}
             )
 
 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
-target_compile_features(ggml PUBLIC c_std_11) # don't bump
+target_compile_features   (ggml PUBLIC c_std_11) # don't bump
+
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 
-if (GGML_USE_CPU_HBM)
-    target_link_libraries(ggml PUBLIC memkind)
-endif()
-
 add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
 
 if (BUILD_SHARED_LIBS)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
     add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
@@ -1063,7 +1131,8 @@ add_library(llama
             )
 
 target_include_directories(llama PUBLIC .)
-target_compile_features(llama PUBLIC cxx_std_11) # don't bump
+target_compile_features   (llama PUBLIC cxx_std_11) # don't bump
+
 target_link_libraries(llama PRIVATE
     ggml
     ${LLAMA_EXTRA_LIBS}
@@ -1114,7 +1183,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
 
 set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
     "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
    "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")

Makefile

@@ -97,9 +97,10 @@ endif
 #
 # keep standard at C11 and C++11
 MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS = -std=c11 -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CFLAGS    = -std=c11   -fPIC
+MK_CXXFLAGS  = -std=c++11 -fPIC
+MK_NVCCFLAGS = -std=c++11
 
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -172,7 +173,7 @@ ifdef LLAMA_DEBUG
 	MK_LDFLAGS += -g
 
 ifeq ($(UNAME_S),Linux)
-	MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
+	MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 endif
 else
 	MK_CPPFLAGS += -DNDEBUG
@@ -215,6 +216,11 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
 			-Werror=implicit-function-declaration
 MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
 
+ifeq ($(LLAMA_FATAL_WARNINGS),1)
+	MK_CFLAGS   += -Werror
+	MK_CXXFLAGS += -Werror
+endif
+
 # this version of Apple ld64 is buggy
 ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
 	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -379,6 +385,9 @@ ifdef LLAMA_CUBLAS
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
 	MK_NVCCFLAGS += -use_fast_math
+ifdef LLAMA_FATAL_WARNINGS
+	MK_NVCCFLAGS += -Werror all-warnings
+endif # LLAMA_FATAL_WARNINGS
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
@@ -437,9 +446,9 @@ ifdef LLAMA_CUDA_CCBIN
 endif
 
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
-	$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endif # JETSON_EOL_MODULE_DETECT
 endif # LLAMA_CUBLAS
@@ -524,11 +533,29 @@ ifdef LLAMA_METAL
 ifdef LLAMA_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
+ifdef LLAMA_METAL_EMBED_LIBRARY
+	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
+	OBJS        += ggml-metal-embed.o
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
+
+ifdef LLAMA_METAL_EMBED_LIBRARY
+ggml-metal-embed.o: ggml-metal.metal
+	@echo "Embedding Metal library"
+	$(eval TEMP_ASSEMBLY=$(shell mktemp))
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+	@$(AS) $(TEMP_ASSEMBLY) -o $@
+	@rm -f ${TEMP_ASSEMBLY}
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_MPI
@@ -540,9 +567,10 @@ GF_CC := $(CC)
 include scripts/get-flags.mk
 
 # combine build flags with cmdline overrides
-override CFLAGS    := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
-BASE_CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
+override CPPFLAGS  := $(MK_CPPFLAGS) $(CPPFLAGS)
+override CFLAGS    := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+BASE_CXXFLAGS      := $(MK_CXXFLAGS) $(CXXFLAGS)
+override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
 override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
@@ -550,7 +578,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 ifdef LLAMA_CUBLAS
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
-CUDA_CXXFLAGS := $(GF_CXXFLAGS)
+CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 
 #
@@ -569,6 +597,14 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef LLAMA_CUBLAS
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
+CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
+ifndef CUDA_DOCKER_ARCH
+ifndef CUDA_POWER_ARCH
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+endif # CUDA_POWER_ARCH
+endif # CUDA_DOCKER_ARCH
+endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
 endif # LLAMA_CUBLAS
 $(info )
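
On CUDA toolchains older than 11.7 the target architecture therefore has to be pinned explicitly, e.g. (a sketch; the value is forwarded to nvcc):

    make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_70 -j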
@@ -683,7 +719,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -854,3 +890,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
 tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

--- a/Package.swift
+++ b/Package.swift

@@ -13,17 +13,31 @@ let package = Package(
     products: [
         .library(name: "llama", targets: ["llama"]),
     ],
-    dependencies: [
-        .package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
-    ],
     targets: [
         .target(
             name: "llama",
-            dependencies: ["ggml"],
             path: ".",
-            exclude: ["ggml-metal.metal"],
+            exclude: [
+               "cmake",
+               "examples",
+               "scripts",
+               "models",
+               "tests",
+               "CMakeLists.txt",
+               "ggml-cuda.cu",
+               "ggml-cuda.h",
+               "Makefile"
+            ],
             sources: [
+                "ggml.c",
                 "llama.cpp",
+                "ggml-alloc.c",
+                "ggml-backend.c",
+                "ggml-quants.c",
+                "ggml-metal.m",
+            ],
+            resources: [
+                .process("ggml-metal.metal")
             ],
             publicHeadersPath: "spm-headers",
             cSettings: [

--- a/README-sycl.md
+++ b/README-sycl.md

@@ -272,7 +272,7 @@ Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact
 a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
 
-Recommend to install to default folder: **/opt/intel/oneapi**.
+Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.
 Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.

--- a/README.md
+++ b/README.md

@@ -10,13 +10,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ### Hot topics
 
-- Remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD: https://github.com/ggerganov/llama.cpp/pull/5240
-- Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
-- [SYCL backend](README-sycl.md) is ready (1/28/2024), support Linux/Windows in Intel GPUs (iGPU, Arc/Flex/Max series)
-- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
-- Collecting Apple Silicon performance stats:
-  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
-  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
+- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
+- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
 
 ----
@@ -61,7 +57,7 @@ variety of hardware - locally and in the cloud.
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2 and AVX512 support for x86 architectures
-- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
+- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
 - Vulkan, SYCL, and (partial) OpenCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
@@ -107,6 +103,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
+- [x] [Gemma](https://ai.google.dev/gemma)
 
 **Multimodal models:**
@@ -145,6 +142,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
+- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
 - [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
@@ -156,6 +154,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
 - [semperai/amica](https://github.com/semperai/amica)
 - [withcatai/catai](https://github.com/withcatai/catai)
+- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
+- [Msty](https://msty.app) (proprietary)
 
 ---
@@ -768,7 +768,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.
 #### How to run
 
-1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
 2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
 3. Output:
 ```
@@ -958,7 +958,7 @@ We have three Docker images available for this project:
 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executabhle file. (platforms: `linux/amd64`, `linux/arm64`)
+3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
 
 Additionally, there the following images, similar to the above:

--- a/build.zig
+++ b/build.zig

@@ -123,6 +123,7 @@ pub fn build(b: *std.build.Builder) !void {
     const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
     const train = make.obj("train", "common/train.cpp");
     const clip = make.obj("clip", "examples/llava/clip.cpp");
+    const llava = make.obj("llava", "examples/llava/llava.cpp");
 
     _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
     _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
@@ -131,7 +132,7 @@ pub fn build(b: *std.build.Builder) !void {
     _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
     _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }

--- a/ci/run.sh
+++ b/ci/run.sh

@@ -33,7 +33,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`
 
-CMAKE_EXTRA=""
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
 
 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
@@ -219,7 +219,7 @@ function gg_run_open_llama_3b_v2 {
     gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
     gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
 
-    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
     unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
     head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
@@ -401,7 +401,7 @@ function gg_run_open_llama_7b_v2 {
     gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
     gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
 
-    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
     unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
 
     path_models="../models-mnt/open-llama/7B-v2"
@@ -568,6 +568,54 @@ function gg_sum_open_llama_7b_v2 {
     #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }
 
+# bge-small
+
+function gg_run_embd_bge_small {
+    cd ${SRC}
+
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
+
+    gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json
+
+    path_models="../models-mnt/bge-small"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert-hf-to-gguf.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+
+    (time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+
+    set +e
+}
+
+function gg_sum_embd_bge_small {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'BGE Small (BERT):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+}
+
 ## main
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -591,6 +639,8 @@ test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    test $ret -eq 0 && gg_run embd_bge_small
+
     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
         if [ -z ${GG_BUILD_CUDA} ]; then
             test $ret -eq 0 && gg_run open_llama_3b_v2

--- a/common/common.cpp
+++ b/common/common.cpp

@@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             std::string value(argv[i]);
-            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
-            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
-            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
+            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
+            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
+            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
             else { invalid_param = true; break; }
         } else if (arg == "--rope-scale") {
             if (++i >= argc) {
@@ -340,13 +340,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            sparams.samplers_sequence = parse_samplers_input(argv[i]);
+            const auto sampler_names = string_split(argv[i], ';');
+            sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
         } else if (arg == "--sampling-seq") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            sparams.samplers_sequence = argv[i];
+            sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
         } else if (arg == "--top-p") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -629,11 +630,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             }
             std::string arg_next = argv[i];
             if (arg_next == "none") {
-                params.split_mode = LLAMA_SPLIT_NONE;
+                params.split_mode = LLAMA_SPLIT_MODE_NONE;
             } else if (arg_next == "layer") {
-                params.split_mode = LLAMA_SPLIT_LAYER;
+                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
-                params.split_mode = LLAMA_SPLIT_ROW;
+                params.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 invalid_param = true;
                 break;
@@ -670,7 +671,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--no-mmap") {
            params.use_mmap = false;
         } else if (arg == "--numa") {
-            params.numa = true;
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string value(argv[i]);
+            /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+            else if (value == "isolate")                   { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+            else if (value == "numactl")                   { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+            else { invalid_param = true; break; }
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "--no-display-prompt") {
@@ -828,15 +837,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             sep++;
             if (strncmp(sep, "int:", 4) == 0) {
                 sep += 4;
-                kvo.tag = LLAMA_KV_OVERRIDE_INT;
+                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
                 kvo.int_value = std::atol(sep);
             } else if (strncmp(sep, "float:", 6) == 0) {
                 sep += 6;
-                kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
+                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
                 kvo.float_value = std::atof(sep);
             } else if (strncmp(sep, "bool:", 5) == 0) {
                 sep += 5;
-                kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
+                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
                 if (std::strcmp(sep, "true") == 0) {
                     kvo.bool_value = true;
                 } else if (std::strcmp(sep, "false") == 0) {
@@ -906,6 +915,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     const llama_sampling_params & sparams = params.sparams;
 
+    std::string sampler_type_chars;
+    std::string sampler_type_names;
+    for (const auto sampler_type : sparams.samplers_sequence) {
+        sampler_type_chars += static_cast<char>(sampler_type);
+        sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
+    }
+    sampler_type_names.pop_back();
+
     printf("\n");
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
@@ -926,7 +943,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -tb N, --threads-batch N\n");
     printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
     printf("  -td N, --threads-draft N");
-    printf("                        number of threads to use during generation (default: same as --threads)");
+    printf("                        number of threads to use during generation (default: same as --threads)\n");
     printf("  -tbd N, --threads-batch-draft N\n");
     printf("                        number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
     printf("  -p PROMPT, --prompt PROMPT\n");
@@ -947,8 +964,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
-    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
+    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\'\n");
+    printf("                        (default: %s)\n", sampler_type_names.c_str());
+    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
     printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
     printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
@@ -996,7 +1014,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
     printf("  --multiple-choice     compute multiple choice score over random tasks from datafile supplied with -f\n");
     printf("  --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
-    printf("  --kl-divergence       computes KL-divergence to logits provided via --kl-divergence-base");
+    printf("  --kl-divergence       computes KL-divergence to logits provided via --kl-divergence-base\n");
     printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
     printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
@@ -1013,7 +1031,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_supports_mmap()) {
         printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
-    printf("  --numa                attempt optimizations that help on some NUMA systems\n");
+    printf("  --numa TYPE           attempt optimizations that help on some NUMA systems\n");
+    printf("                          - distribute: spread execution evenly over all nodes\n");
+    printf("                          - isolate: only spawn threads on CPUs on the node that execution started on\n");
+    printf("                          - numactl: use the CPU map provided by numactl\n");
     printf("                        if run without this previously, it is recommended to drop the system page cache before using this\n");
     printf("                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
     if (llama_supports_gpu_offload()) {
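
Worth noting how the help text above now derives its defaults instead of hard-coding them: the `--samplers` and `--sampling-seq` strings are built from `sparams.samplers_sequence` at runtime. A self-contained sketch of that derivation (the enum and the name function here mirror definitions introduced later in this patch; this is illustrative, not a verbatim excerpt):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// mirrors the enum added in common/sampling.h: each sampler's value is its one-letter code
enum class llama_sampler_type : char {
    TOP_K = 'k', TOP_P = 'p', MIN_P = 'm', TFS_Z = 'f', TYPICAL_P = 'y', TEMPERATURE = 't'
};

static std::string sampler_type_to_name_string(llama_sampler_type t) {
    switch (t) {
        case llama_sampler_type::TOP_K:       return "top_k";
        case llama_sampler_type::TFS_Z:       return "tfs_z";
        case llama_sampler_type::TYPICAL_P:   return "typical_p";
        case llama_sampler_type::TOP_P:       return "top_p";
        case llama_sampler_type::MIN_P:       return "min_p";
        case llama_sampler_type::TEMPERATURE: return "temperature";
        default:                              return "";
    }
}

int main() {
    // default sequence from llama_sampling_params
    const std::vector<llama_sampler_type> seq = {
        llama_sampler_type::TOP_K, llama_sampler_type::TFS_Z, llama_sampler_type::TYPICAL_P,
        llama_sampler_type::TOP_P, llama_sampler_type::MIN_P, llama_sampler_type::TEMPERATURE,
    };
    std::string chars, names;
    for (auto t : seq) {
        chars += static_cast<char>(t);                  // builds "kfypmt"
        names += sampler_type_to_name_string(t) + ";";  // builds "top_k;tfs_z;..."
    }
    names.pop_back(); // drop trailing ';'
    std::printf("--sampling-seq default: %s\n", chars.c_str()); // kfypmt
    std::printf("--samplers default:     %s\n", names.c_str()); // top_k;tfs_z;typical_p;top_p;min_p;temperature
}
```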
@@ -1097,45 +1118,101 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 }
 
 //
-// String parsing
+// String utils
 //
 
-std::string parse_samplers_input(std::string input) {
-    std::string output = "";
+std::vector<std::string> string_split(std::string input, char separator) {
+    std::vector<std::string> parts;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(0, separator_pos);
+        parts.emplace_back(part);
+        input = input.substr(separator_pos + 1);
+        separator_pos = input.find(separator);
+    }
+    parts.emplace_back(input);
+    return parts;
+}
+
+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
+        {"top_k",       llama_sampler_type::TOP_K},
+        {"top_p",       llama_sampler_type::TOP_P},
+        {"typical_p",   llama_sampler_type::TYPICAL_P},
+        {"min_p",       llama_sampler_type::MIN_P},
+        {"tfs_z",       llama_sampler_type::TFS_Z},
+        {"temperature", llama_sampler_type::TEMPERATURE}
+    };
+
     // since samplers names are written multiple ways
     // make it ready for both system names and input names
-    std::unordered_map<std::string, char> samplers_symbols {
-        {"top_k",      'k'},
-        {"top-k",      'k'},
-        {"top_p",      'p'},
-        {"top-p",      'p'},
-        {"nucleus",    'p'},
-        {"typical_p",  'y'},
-        {"typical-p",  'y'},
-        {"typical",    'y'},
-        {"min_p",      'm'},
-        {"min-p",      'm'},
-        {"tfs_z",      'f'},
-        {"tfs-z",      'f'},
-        {"tfs",        'f'},
-        {"temp",       't'},
-        {"temperature",'t'}
+    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
+        {"top-k",     llama_sampler_type::TOP_K},
+        {"top-p",     llama_sampler_type::TOP_P},
+        {"nucleus",   llama_sampler_type::TOP_P},
+        {"typical-p", llama_sampler_type::TYPICAL_P},
+        {"typical",   llama_sampler_type::TYPICAL_P},
+        {"min-p",     llama_sampler_type::MIN_P},
+        {"tfs-z",     llama_sampler_type::TFS_Z},
+        {"tfs",       llama_sampler_type::TFS_Z},
+        {"temp",      llama_sampler_type::TEMPERATURE}
     };
-    // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
-    size_t separator = input.find(';');
-    while (separator != input.npos) {
-        std::string name = input.substr(0,separator);
-        input = input.substr(separator+1);
-        separator = input.find(';');
 
-        if (samplers_symbols.find(name) != samplers_symbols.end()) {
-            output += samplers_symbols[name];
+    std::vector<llama_sampler_type> sampler_types;
+    sampler_types.reserve(names.size());
+    for (const auto & name : names)
+    {
+        auto sampler_item = sampler_canonical_name_map.find(name);
+        if (sampler_item != sampler_canonical_name_map.end())
+        {
+            sampler_types.push_back(sampler_item->second);
+        }
+        else
+        {
+            if (allow_alt_names)
+            {
+                sampler_item = sampler_alt_name_map.find(name);
+                if (sampler_item != sampler_alt_name_map.end())
+                {
+                    sampler_types.push_back(sampler_item->second);
+                }
+            }
         }
     }
-    if (samplers_symbols.find(input) != samplers_symbols.end()) {
-        output += samplers_symbols[input];
+    return sampler_types;
+}
+
+std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
+    std::unordered_map<char, llama_sampler_type> sampler_name_map {
+        {'k', llama_sampler_type::TOP_K},
+        {'p', llama_sampler_type::TOP_P},
+        {'y', llama_sampler_type::TYPICAL_P},
+        {'m', llama_sampler_type::MIN_P},
+        {'f', llama_sampler_type::TFS_Z},
+        {'t', llama_sampler_type::TEMPERATURE}
+    };
+
+    std::vector<llama_sampler_type> sampler_types;
+    sampler_types.reserve(names_string.size());
+    for (const auto & c : names_string) {
+        const auto sampler_item = sampler_name_map.find(c);
+        if (sampler_item != sampler_name_map.end()) {
+            sampler_types.push_back(sampler_item->second);
+        }
+    }
+    return sampler_types;
+}
+
+std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
+    switch (sampler_type) {
+        case llama_sampler_type::TOP_K:       return "top_k";
+        case llama_sampler_type::TFS_Z:       return "tfs_z";
+        case llama_sampler_type::TYPICAL_P:   return "typical_p";
+        case llama_sampler_type::TOP_P:       return "top_p";
+        case llama_sampler_type::MIN_P:       return "min_p";
+        case llama_sampler_type::TEMPERATURE: return "temperature";
+        default : return "";
     }
-    return output;
 }
 
 //
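
A quick illustration of how the two CLI spellings converge after this change; it assumes the three functions above are available via `common/common.h` (illustrative usage, not code from the patch):

```cpp
#include <cassert>
#include "common/common.h" // assumed include path for string_split and the sampler parsers

void check_sampler_parsing() {
    // "--samplers top_k;min_p;temp": split on ';', then resolve names
    // ("temp" resolves via the alt-name map, so allow_alt_names must be true)
    const auto names      = string_split("top_k;min_p;temp", ';');
    const auto from_names = sampler_types_from_names(names, /*allow_alt_names=*/true);

    // "--sampling-seq kmt": one sampler per character
    const auto from_chars = sampler_types_from_chars("kmt");

    // both spellings should produce the same sequence; unknown names/chars are skipped
    assert(from_names == from_chars);
}
```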
@@ -1550,6 +1627,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
     fprintf(stream, "cpu_has_sse3: %s\n",        ggml_cpu_has_sse3()        ? "true" : "false");
     fprintf(stream, "cpu_has_vsx: %s\n",         ggml_cpu_has_vsx()         ? "true" : "false");
+    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
 
 #ifdef NDEBUG
     fprintf(stream, "debug: false\n");
@@ -1626,6 +1704,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     }
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
@@ -1639,7 +1718,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
     fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
     fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
-    fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
     fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
@@ -1664,7 +1742,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
     fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
+    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
     fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
@@ -1673,7 +1751,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
@@ -1724,7 +1802,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
             if (cs_curr[j] < 0) { continue; }
             if (seqs.find(cs_curr[j]) == seqs.end()) {
                 if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                seqs[cs_curr[j]] = seqs.size();
+                const size_t sz = seqs.size();
+                seqs[cs_curr[j]] = sz;
             }
         }
         if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
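
The last hunk's two-step assignment is worth a note: in `seqs[cs_curr[j]] = seqs.size()`, `operator[]` may insert into the map, and before C++17 whether `size()` is read before or after that insertion is unspecified. A toy demonstration of the hazard and of the fix's explicit ordering:

```cpp
#include <cstdio>
#include <map>

int main() {
    std::map<int, size_t> m;
    // Before C++17 this is unspecified: if m[42] (which inserts) is evaluated
    // first, size() yields 1; if size() is evaluated first, it yields 0.
    m[42] = m.size();
    std::printf("m[42]  = %zu\n", m[42]); // 0 or 1 depending on compiler/standard mode

    // The patch pins the order explicitly:
    std::map<int, size_t> m2;
    const size_t sz = m2.size(); // read the size first
    m2[42] = sz;                 // then insert; always 0
    std::printf("m2[42] = %zu\n", m2[42]);
}
```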

--- a/common/common.h
+++ b/common/common.h

@@ -61,7 +61,7 @@ struct gpt_params {
     float   p_split            = 0.1f;  // speculative decoding split probability
     int32_t n_gpu_layers       = -1;    // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
+    llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     int32_t main_gpu           = 0;     // the GPU that is used for scratch and small tensors
     float   tensor_split[128]  = {0};   // how split tensors should be distributed across GPUs
     int32_t n_beams            = 0;     // if non-zero then use beam search of given width.
@@ -75,7 +75,8 @@ struct gpt_params {
     float   yarn_beta_fast     = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow     = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx      = 0;     // YaRN original context length
-    int32_t rope_scaling_type  = LLAMA_ROPE_SCALING_UNSPECIFIED;
+    int32_t rope_scaling_type  = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+    ggml_numa_strategy numa    = GGML_NUMA_STRATEGY_DISABLED;
 
     // // sampling parameters
     struct llama_sampling_params sparams;
@@ -134,7 +135,6 @@ struct gpt_params {
     bool logits_all        = false; // return logits for all tokens in the batch
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
-    bool numa              = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt    = false; // print prompt tokens before generation
     bool display_prompt    = true;  // print prompt before generation
     bool infill            = false; // use infill mode
@@ -162,10 +162,13 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 void process_escapes(std::string& input);
 
 //
-// String parsing
+// String utils
 //
 
-std::string parse_samplers_input(std::string input);
+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
+std::vector<std::string> string_split(std::string input, char separator);
+std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
 
 //
 // Model utils

--- a/common/sampling.cpp
+++ b/common/sampling.cpp

@@ -103,15 +103,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
 std::string llama_sampling_order_print(const llama_sampling_params & params) {
     std::string result = "CFG -> Penalties ";
     if (params.mirostat == 0) {
-        for (auto s : params.samplers_sequence) {
-            switch (s) {
-                case 'k': result += "-> top_k "; break;
-                case 'f': result += "-> tfs_z "; break;
-                case 'y': result += "-> typical_p "; break;
-                case 'p': result += "-> top_p "; break;
-                case 'm': result += "-> min_p "; break;
-                case 't': result += "-> temp "; break;
-                default : break;
+        for (auto sampler_type : params.samplers_sequence) {
+            const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
+            if (!sampler_type_name.empty()) {
+                result += "-> " + sampler_type_name + " ";
             }
         }
     } else {
@@ -126,9 +121,7 @@ static void sampler_queue(
                    struct llama_context * ctx_main,
             const llama_sampling_params & params,
                  llama_token_data_array & cur_p,
-                                 size_t & min_keep) {
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
+                                   size_t min_keep) {
     const float         temp              = params.temp;
     const float         dynatemp_range    = params.dynatemp_range;
     const float         dynatemp_exponent = params.dynatemp_exponent;
@@ -137,16 +130,16 @@ static void sampler_queue(
     const float         min_p             = params.min_p;
     const float         tfs_z             = params.tfs_z;
     const float         typical_p         = params.typical_p;
-    const std::string & samplers_sequence = params.samplers_sequence;
+    const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
 
-    for (auto s : samplers_sequence) {
-        switch (s){
-            case 'k': llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
-            case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
-            case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
-            case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
-            case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case 't':
+    for (auto sampler_type : samplers_sequence) {
+        switch (sampler_type) {
+            case llama_sampler_type::TOP_K    : llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
+            case llama_sampler_type::TFS_Z    : llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
+            case llama_sampler_type::TYPICAL_P: llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
+            case llama_sampler_type::TOP_P    : llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
+            case llama_sampler_type::MIN_P    : llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
+            case llama_sampler_type::TEMPERATURE:
                 if (dynatemp_range > 0) {
                     float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
                     float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
@@ -256,7 +249,7 @@ static llama_token llama_sampling_sample_impl(
         id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
     } else {
         // temperature sampling
-        size_t min_keep = std::max(1, params.n_probs);
+        size_t min_keep = std::max(1, params.min_keep);
         sampler_queue(ctx_main, params, cur_p, min_keep);
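
Two related changes here: `min_keep` now comes from `params.min_keep` (clamped to at least 1) rather than from `n_probs`, and it is passed by value since `sampler_queue` no longer writes back through it. The queue itself simply dispatches the configured samplers in sequence order; the following is a self-contained toy of that control flow, not the llama.cpp API:

```cpp
#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

enum class sampler { TOP_K, TEMPERATURE }; // toy stand-in for llama_sampler_type

// keep the k highest-probability candidates, but never fewer than min_keep
static void top_k(std::vector<float> & p, size_t k, size_t min_keep) {
    const size_t keep = std::max(k, min_keep);
    if (p.size() <= keep) return;
    std::partial_sort(p.begin(), p.begin() + keep, p.end(), std::greater<float>());
    p.resize(keep);
}

int main() {
    std::vector<float> cand = {0.5f, 0.2f, 0.15f, 0.1f, 0.05f};
    const std::vector<sampler> sequence = {sampler::TOP_K, sampler::TEMPERATURE};
    const size_t min_keep = 1; // mirrors std::max(1, params.min_keep)

    for (auto s : sequence) { // dispatch in configured order, like sampler_queue
        switch (s) {
            case sampler::TOP_K:       top_k(cand, 3, min_keep); break;
            case sampler::TEMPERATURE: for (auto & x : cand) x /= 0.8f; break; // scale by temp
        }
    }
    std::printf("%zu candidates remain\n", cand.size()); // 3
}
```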

--- a/common/sampling.h
+++ b/common/sampling.h

@@ -8,10 +8,21 @@
 #include <vector>
 #include <unordered_map>
 
+// sampler types
+enum class llama_sampler_type : char {
+    TOP_K       = 'k',
+    TOP_P       = 'p',
+    MIN_P       = 'm',
+    TFS_Z       = 'f',
+    TYPICAL_P   = 'y',
+    TEMPERATURE = 't'
+};
+
 // sampling parameters
 typedef struct llama_sampling_params {
     int32_t     n_prev                = 64;    // number of previous tokens to remember
     int32_t     n_probs               = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t     min_keep              = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
     int32_t     top_k                 = 40;    // <= 0 to use vocab size
     float       top_p                 = 0.95f; // 1.0 = disabled
     float       min_p                 = 0.05f; // 0.0 = disabled
@@ -28,7 +39,15 @@ typedef struct llama_sampling_params {
     float       mirostat_tau          = 5.00f; // target entropy
     float       mirostat_eta          = 0.10f; // learning rate
     bool        penalize_nl           = true;  // consider newlines as a repeatable token
-    std::string samplers_sequence     = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
+
+    std::vector<llama_sampler_type> samplers_sequence = {
+        llama_sampler_type::TOP_K,
+        llama_sampler_type::TFS_Z,
+        llama_sampler_type::TYPICAL_P,
+        llama_sampler_type::TOP_P,
+        llama_sampler_type::MIN_P,
+        llama_sampler_type::TEMPERATURE
+    };
 
     std::string grammar;  // optional BNF-like grammar to constrain sampling
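
Since each enum value is the sampler's one-letter code, the typed default above is exactly what the old `"kfypmt"` string used to parse to. A small check, assuming the common headers are on the include path (illustrative only):

```cpp
#include <cassert>
#include "common/common.h"   // assumed path for sampler_types_from_chars
#include "common/sampling.h" // assumed path for llama_sampling_params

int main() {
    llama_sampling_params p;
    // the new vector default matches the parse of the old string default
    assert(p.samplers_sequence == sampler_types_from_chars("kfypmt"));
}
```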

--- a/common/train.cpp
+++ b/common/train.cpp

@@ -31,7 +31,7 @@ struct train_state * init_train_state() {
     state->opt = new struct ggml_opt_context;
     state->opt->ctx = NULL;
-    state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
+    state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
     state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
     state->opt->loss_after = 0.0f;
@@ -556,7 +556,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
     std::string opt_type;
     GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE);
     if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) {
-        opt->params.type = GGML_OPT_ADAM;
+        opt->params.type = GGML_OPT_TYPE_ADAM;
 
         GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS);
         GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS);
@@ -568,7 +568,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
         copy_tensor_by_name(opt->adam.v,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
         copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
     } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
-        opt->params.type = GGML_OPT_LBFGS;
+        opt->params.type = GGML_OPT_TYPE_LBFGS;
 
         GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT);
         GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS);
@@ -603,7 +603,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
     gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized);
 
     switch (opt->params.type) {
-        case GGML_OPT_ADAM:
+        case GGML_OPT_TYPE_ADAM:
             {
                 gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM);
                 gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best);
@@ -622,7 +622,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
                     gguf_add_tensor(fctx, opt->adam.pf);
                 }
             } break;
-        case GGML_OPT_LBFGS:
+        case GGML_OPT_TYPE_LBFGS:
             {
                 gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
                 gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);

--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py

@ -10,7 +10,7 @@ import re
import sys import sys
from enum import IntEnum from enum import IntEnum
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast
import numpy as np import numpy as np
import torch import torch
@ -25,15 +25,6 @@ import gguf
from convert import HfVocab from convert import HfVocab
# check for any of the given keys in the dictionary and return the value of the first key found
def get_key_opts(d, keys):
for k in keys:
if k in d:
return d[k]
print(f"Could not find any of {keys}")
sys.exit()
###### MODEL DEFINITIONS ###### ###### MODEL DEFINITIONS ######
class SentencePieceTokenTypes(IntEnum): class SentencePieceTokenTypes(IntEnum):
@ -58,6 +49,15 @@ class Model:
self.hparams = Model.load_hparams(self.dir_model) self.hparams = Model.load_hparams(self.dir_model)
self.model_arch = self._get_model_architecture() self.model_arch = self._get_model_architecture()
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
key = next((k for k in keys if k in self.hparams), None)
if key is not None:
return self.hparams[key]
if optional:
return None
raise KeyError(f"could not find any of: {keys}")
def set_vocab(self): def set_vocab(self):
self._set_vocab_gpt2() self._set_vocab_gpt2()
@ -79,28 +79,33 @@ class Model:
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_block_count(self.hparams.get( self.gguf_writer.add_block_count(self.block_count)
"n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
)) if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
self.gguf_writer.add_context_length(n_ctx) self.gguf_writer.add_context_length(n_ctx)
if (n_embd := self.hparams.get("hidden_size")) is not None:
self.gguf_writer.add_embedding_length(n_embd) n_embd = self.find_hparam(["hidden_size", "n_embd"])
if (n_ff := self.hparams.get("intermediate_size")) is not None: self.gguf_writer.add_embedding_length(n_embd)
if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
self.gguf_writer.add_feed_forward_length(n_ff) self.gguf_writer.add_feed_forward_length(n_ff)
if (n_head := self.hparams.get("num_attention_heads")) is not None:
self.gguf_writer.add_head_count(n_head) n_head = self.find_hparam(["num_attention_heads", "n_head"])
self.gguf_writer.add_head_count(n_head)
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
self.gguf_writer.add_head_count_kv(n_head_kv) self.gguf_writer.add_head_count_kv(n_head_kv)
if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None: if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps) self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
if (n_experts := self.hparams.get("num_local_experts")) is not None: if (n_experts := self.hparams.get("num_local_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts) self.gguf_writer.add_expert_count(n_experts)
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used) self.gguf_writer.add_expert_used_count(n_experts_used)
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) self.gguf_writer.add_file_type(self.ftype)
def write_tensors(self): def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@ -187,7 +192,7 @@ class Model:
return RefactModel return RefactModel
if model_architecture == "PersimmonForCausalLM": if model_architecture == "PersimmonForCausalLM":
return PersimmonModel return PersimmonModel
if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
return StableLMModel return StableLMModel
if model_architecture == "QWenLMHeadModel": if model_architecture == "QWenLMHeadModel":
return QwenModel return QwenModel
@ -209,6 +214,12 @@ class Model:
return InternLM2Model return InternLM2Model
if model_architecture == "MiniCPMForCausalLM": if model_architecture == "MiniCPMForCausalLM":
return MiniCPMModel return MiniCPMModel
if model_architecture == "BertModel":
return BertModel
if model_architecture == "NomicBertModel":
return NomicBertModel
if model_architecture == "GemmaForCausalLM":
return GemmaModel
return Model return Model
def _is_model_safetensors(self) -> bool: def _is_model_safetensors(self) -> bool:
@ -242,7 +253,7 @@ class Model:
return gguf.MODEL_ARCH.REFACT return gguf.MODEL_ARCH.REFACT
if arch == "PersimmonForCausalLM": if arch == "PersimmonForCausalLM":
return gguf.MODEL_ARCH.PERSIMMON return gguf.MODEL_ARCH.PERSIMMON
if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
return gguf.MODEL_ARCH.STABLELM return gguf.MODEL_ARCH.STABLELM
if arch == "QWenLMHeadModel": if arch == "QWenLMHeadModel":
return gguf.MODEL_ARCH.QWEN return gguf.MODEL_ARCH.QWEN
@ -264,6 +275,12 @@ class Model:
return gguf.MODEL_ARCH.INTERNLM2 return gguf.MODEL_ARCH.INTERNLM2
if arch == "MiniCPMForCausalLM": if arch == "MiniCPMForCausalLM":
return gguf.MODEL_ARCH.MINICPM return gguf.MODEL_ARCH.MINICPM
if arch == "BertModel":
return gguf.MODEL_ARCH.BERT
if arch == "NomicBertModel":
return gguf.MODEL_ARCH.NOMIC_BERT
if arch == "GemmaForCausalLM":
return gguf.MODEL_ARCH.GEMMA
raise NotImplementedError(f'Architecture "{arch}" not supported!') raise NotImplementedError(f'Architecture "{arch}" not supported!')
@ -605,11 +622,6 @@ class MPTModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
# note: MPT output is tied to (same as) wte in original model;
# for easier implementation in llama.cpp it's duplicated in GGUF, though :/
if new_name == "token_embd.weight":
self.gguf_writer.add_tensor("output.weight", data)
class OrionModel(Model): class OrionModel(Model):
def set_vocab(self): def set_vocab(self):
@@ -642,6 +654,8 @@ class OrionModel(Model):
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_head_count(head_count)
         self.gguf_writer.add_head_count_kv(head_count_kv)
+        # note: config provides rms norm but it is actually layer norm
+        # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
         self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])

     def write_tensors(self):
@@ -1018,7 +1032,6 @@ class PersimmonModel(Model):
         self.gguf_writer.add_head_count_kv(head_count_kv)
         self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

     def set_vocab(self):
         self._set_vocab_sentencepiece()
@@ -1061,10 +1074,11 @@ class StableLMModel(Model):
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
+        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
-        self.gguf_writer.add_layer_norm_eps(1e-5)
+        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))

 class MixtralModel(Model):
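Note on the rotary dimension arithmetic in the StableLM hunk above: the count written to GGUF is the fraction of each attention head's width that RoPE rotates. A quick worked example in Python, with illustrative numbers rather than any specific checkpoint's config:

```python
# Illustrative hyperparameters (placeholders, not from a real config).
hidden_size = 2560
num_attention_heads = 32
rotary_factor = 0.25  # from "partial_rotary_factor", or the older "rope_pct"

head_dim = hidden_size // num_attention_heads  # 80 dimensions per head
rope_dims = int(rotary_factor * head_dim)      # 20 of those are rotated by RoPE
print(rope_dims)
```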
@@ -1293,21 +1307,21 @@ class GPT2Model(Model):
 class Phi2Model(Model):
     def set_gguf_parameters(self):
-        block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])
-        rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
-        n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
-        n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])
+        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
+        rot_pct = self.find_hparam(["partial_rotary_factor"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
         self.gguf_writer.add_name("Phi2")
-        self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
+        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
         self.gguf_writer.add_embedding_length(n_embd)
         self.gguf_writer.add_feed_forward_length(4 * n_embd)
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head)
-        self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
+        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
         self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
         self.gguf_writer.add_file_type(self.ftype)
         self.gguf_writer.add_add_bos_token(False)
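`find_hparam` replaces the old `get_key_opts` helper here, but its definition is outside this excerpt. A plausible minimal sketch, assuming it simply returns the value of the first config key that exists (an assumption, not the actual implementation):

```python
from typing import Any

def find_hparam(hparams: dict[str, Any], keys: list[str], optional: bool = False) -> Any:
    # return the value for the first key present in the config dict
    for key in keys:
        if key in hparams:
            return hparams[key]
    if optional:
        return None
    raise KeyError(f"could not find any of: {keys}")
```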
@@ -1629,6 +1643,206 @@ in chat mode so that the conversation can end normally.")
         self.post_write_tensors(tensor_map, name, data_torch)

+
+class BertModel(Model):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.vocab_size = None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_causal_attention(False)
+
+        # get pooling path
+        with open(self.dir_model / "modules.json", encoding="utf-8") as f:
+            modules = json.load(f)
+        pooling_path = None
+        for mod in modules:
+            if mod["type"] == "sentence_transformers.models.Pooling":
+                pooling_path = mod["path"]
+                break
+
+        # get pooling type
+        pooling_type = gguf.PoolingType.NONE
+        if pooling_path is not None:
+            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
+                pooling = json.load(f)
+            if pooling["pooling_mode_mean_tokens"]:
+                pooling_type = gguf.PoolingType.MEAN
+            elif pooling["pooling_mode_cls_token"]:
+                pooling_type = gguf.PoolingType.CLS
+            else:
+                raise NotImplementedError("Only MEAN and CLS pooling types supported")
+        self.gguf_writer.add_pooling_type(pooling_type.value)
+
+    def set_vocab(self):
+        path = self.dir_model
+        added_tokens_path = self.dir_model if self.dir_model.exists() else None
+
+        # use huggingface vocab to get all tokens
+        vocab = HfVocab(path, added_tokens_path)
+        tokens, scores, toktypes = zip(*vocab.all_tokens())
+        assert len(tokens) == vocab.vocab_size
+        self.vocab_size = vocab.vocab_size
+
+        # we need this to validate the size of the token_type embeddings
+        # though currently we are passing all zeros to the token_type embeddings
+        n_token_types = len(set(toktypes))
+        self.gguf_writer.add_token_type_count(n_token_types)
+
+        # convert to phantom space vocab
+        def phantom(tok, typ):
+            if tok.startswith(b"[") and tok.endswith(b"]"):
+                return tok
+            if tok.startswith(b"##"):
+                return tok[2:]
+            return b"\xe2\x96\x81" + tok
+        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
+
+        # set up bos and eos tokens (cls and sep)
+        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
+        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
+
+        # add vocab to gguf
+        self.gguf_writer.add_tokenizer_model("bert")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        # handle special tokens
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def write_tensors(self):
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+        tensors = dict(self.get_tensors())
+        for name, data_torch in tensors.items():
+            # we are only using BERT for embeddings so we don't need the pooling layer
+            if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
+                continue  # we don't need these
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            data = data_torch.squeeze().numpy()
+            n_dims = len(data.shape)
+            new_dtype: type[np.floating[Any]]
+
+            if (
+                self.ftype == 1 and name.endswith(".weight") and n_dims == 2
+                and name != "embeddings.token_type_embeddings.weight"  # not used with get_rows, must be F32
+            ):
+                # if f16 desired, convert any float32 2-dim weight tensors to float16
+                new_dtype = np.float16
+            else:
+                # if f32 desired, convert any float16 to float32
+                new_dtype = np.float32
+
+            print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
+
+            if data.dtype != new_dtype:
+                data = data.astype(new_dtype)
+
+            self.gguf_writer.add_tensor(new_name, data)
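The pooling type recorded by `set_gguf_parameters` above tells the runtime how to reduce per-token embeddings to a single vector per sequence. A small numpy sketch of the two supported modes (illustrative only, not the llama.cpp implementation):

```python
import numpy as np

def pool(token_embeddings: np.ndarray, pooling: str) -> np.ndarray:
    # token_embeddings has shape (n_tokens, n_embd)
    if pooling == "mean":
        return token_embeddings.mean(axis=0)  # average over all token positions
    if pooling == "cls":
        return token_embeddings[0]            # take the first ([CLS]) token
    raise NotImplementedError("Only MEAN and CLS pooling types supported")

sentence_vec = pool(np.random.rand(7, 384), "mean")  # one 384-dim vector per sequence
```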
+class NomicBertModel(BertModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # the HF config claims n_ctx=8192, but it uses RoPE scaling
+        self.hparams["n_ctx"] = 2048
+
+        # SwigLU activation
+        assert self.hparams["activation_function"] == "swiglu"
+        # this doesn't do anything in the HF version
+        assert self.hparams["causal"] is False
+        # no bias tensors
+        assert self.hparams["qkv_proj_bias"] is False
+        assert self.hparams["mlp_fc1_bias"] is False
+        assert self.hparams["mlp_fc2_bias"] is False
+        # norm at end of layer
+        assert self.hparams["prenorm"] is False
+        # standard RoPE
+        assert self.hparams["rotary_emb_fraction"] == 1.0
+        assert self.hparams["rotary_emb_interleaved"] is False
+        assert self.hparams["rotary_emb_scale_base"] is None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
+
+    def get_tensors(self):
+        assert self.vocab_size is not None
+        for name, data in super().get_tensors():
+            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
+            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
+                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
+                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
+                data = data[:self.vocab_size, :]
+            yield name, data
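The rounding in `get_tensors` un-pads an embedding matrix that was grown to a multiple of 64 rows for hardware efficiency. The same arithmetic in isolation:

```python
def rounded_vocab_size(vocab_size: int, pad_to: int = 64) -> int:
    # round up to the next multiple of pad_to, as in (vocab_size + 63) // 64 * 64
    return (vocab_size + pad_to - 1) // pad_to * pad_to

assert rounded_vocab_size(30522) == 30528  # e.g. a BERT-sized vocab padded to 64 rows
```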
+class GemmaModel(Model):
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_key_length(hparams["head_dim"])
+        self.gguf_writer.add_value_length(hparams["head_dim"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        for name, data_torch in self.get_tensors():
+            # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
+            if name.endswith("norm.weight"):
+                data_torch = data_torch + 1
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
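The `data_torch + 1` above follows the referenced Gemma modeling code, which stores norm weights zero-centered and multiplies by `(1 + weight)`; adding one at conversion time lets a plain RMSNorm-times-weight kernel reproduce it. A toy check of that equivalence (a sketch with a hand-rolled RMSNorm, not the actual kernels):

```python
import torch

def rms_norm(x: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)

w = torch.tensor([-0.1, 0.0, 0.2])  # zero-centered weight, as Gemma stores it
x = torch.randn(3)

reference = rms_norm(x) * (1 + w)   # what the reference implementation computes
converted = rms_norm(x) * (w + 1)   # plain RMSNorm with the weight this converter writes
assert torch.allclose(reference, converted)
```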
 ###### CONVERSION LOGIC ######
@@ -88,7 +88,8 @@ def main():
     gguf_writer.add_embedding_length(hidden_size)
     gguf_writer.add_block_count(block_count)
     gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
-    gguf_writer.add_rope_dimension_count(hidden_size // head_count)
+    # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
+    gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
     gguf_writer.add_head_count(head_count)
     gguf_writer.add_head_count_kv(head_count_kv)
     gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
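The extra `// 2` encodes that, per the referenced fix, rotary embeddings cover only half of each Persimmon attention head. Worked numbers (illustrative values, not read from a config file):

```python
hidden_size = 4096  # placeholder values for illustration
head_count = 64

head_dim = hidden_size // head_count        # 64 dimensions per head
rope_dims = hidden_size // head_count // 2  # 32: RoPE applies to half of each head
```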
@@ -1173,7 +1173,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
             for (name, tensor) in model.items()}

-def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
+def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
     tmap = gguf.TensorNameMap(ARCH, params.n_layer)
     should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
@@ -1199,7 +1199,11 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
     for name, lazy_tensor in model.items():
         tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
         if name_new is None:
-            raise Exception(f"Unexpected tensor name: {name}")
+            if skip_unknown:
+                print(f"Unexpected tensor name: {name} - skipping")
+                continue
+            else:
+                raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")

         if tensor_type in should_skip:
             print(f"skipping tensor {name_new}")
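With the flag threaded through (see the argparse change in the next hunk), a conversion that meets extra tensors, such as the vision-tower weights in a LLaVA checkpoint, can skip them instead of aborting. A hypothetical invocation through the script's `main(args_in=...)` entry point, with a placeholder model path:

```python
# Hypothetical call; "models/llava-v1.5-7b" is a placeholder path.
main(["models/llava-v1.5-7b", "--outtype", "f16", "--skip-unknown"])
```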
@@ -1377,19 +1381,20 @@ def main(args_in: list[str] | None = None) -> None:
         output_choices.append("q8_0")
     vocab_types = ["spm", "bpe", "hfft"]
     parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
     parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
     parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
     parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
     parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
     parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
     parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
+    parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")

     args = parser.parse_args(args_in)
     if args.awq_path:
@@ -1461,7 +1466,7 @@ def main(args_in: list[str] | None = None) -> None:
         print(f"Special vocab info: {special_vocab}")

     model = model_plus.model
-    model = convert_model_names(model, params)
+    model = convert_model_names(model, params, args.skip_unknown)
     ftype = pick_output_type(model, args.outtype)
     model = convert_to_output_type(model, ftype)
     outfile = args.outfile or default_outfile(model_plus.paths, ftype)
@@ -38,6 +38,7 @@ else()
     add_subdirectory(speculative)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
+    add_subdirectory(gguf)
     add_subdirectory(train-text-from-scratch)
     add_subdirectory(imatrix)
     if (LLAMA_BUILD_SERVER)
@@ -1533,27 +1533,28 @@ int main(int argc, char ** argv) {
         int n_past = 0;

-        ggml_cgraph gf = {};
+        struct ggml_cgraph * gf = NULL;
+        gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);

         get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

-        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
+        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch);
         // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
         struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);

-        ggml_build_forward_expand(&gf, e);
-        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+        ggml_build_forward_expand(gf, e);
+        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);

         float error_before_opt = ggml_get_f32_1d(e, 0);

-        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
+        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS);
         opt_params_lbfgs.print_forward_graph = false;
         opt_params_lbfgs.print_backward_graph = false;
         opt_params_lbfgs.lbfgs.n_iter = 16;
         ggml_opt(ctx0, opt_params_lbfgs, e);
         //
-        ggml_build_forward_expand(&gf, e);
-        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+        ggml_build_forward_expand(gf, e);
+        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);

         float error_after_opt = ggml_get_f32_1d(e, 0);

@@ -1600,13 +1601,14 @@ int main(int argc, char ** argv) {
         };
         struct ggml_context * ctx0 = ggml_init(params);

-        ggml_cgraph gf = {};
+        struct ggml_cgraph * gf = NULL;
+        gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);

         int n_past = 0;

-        struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
+        struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);

-        ggml_build_forward_expand(&gf, logits);
-        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+        ggml_build_forward_expand(gf, logits);
+        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);

         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
         struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -82,7 +82,8 @@ int main(int argc, char ** argv) {

     // init LLM

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

     // initialize the model

@@ -158,7 +159,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");

     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
@@ -17,7 +17,7 @@ let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(argu
 let n_len: Int = 32

 // init LLM
-llama_backend_init(false)
+llama_backend_init()
 defer {
     llama_backend_free()
 }
@@ -50,7 +50,8 @@ int main(int argc, char ** argv) {

     // init LLM

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

     // initialize the model

@@ -91,7 +92,7 @@ int main(int argc, char ** argv) {
     const int n_ctx = llama_n_ctx(ctx);

-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);

     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
@@ -119,7 +119,8 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

     llama_model * model;
     llama_context * ctx;
@@ -325,14 +325,14 @@ struct train_params {
 };

 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
-    printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
-    printf("%s: n_embd: %d\n", __func__, params->n_embd);
-    printf("%s: n_mult: %d\n", __func__, params->n_mult);
-    printf("%s: n_head: %d\n", __func__, params->n_head);
-    printf("%s: n_ff: %d\n", __func__, params->n_ff);
-    printf("%s: n_layer: %d\n", __func__, params->n_layer);
-    printf("%s: n_rot: %d\n", __func__, params->n_rot);
+    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
+    printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
+    printf("%s: n_embd: %u\n", __func__, params->n_embd);
+    printf("%s: n_mult: %u\n", __func__, params->n_mult);
+    printf("%s: n_head: %u\n", __func__, params->n_head);
+    printf("%s: n_ff: %u\n", __func__, params->n_ff);
+    printf("%s: n_layer: %u\n", __func__, params->n_layer);
+    printf("%s: n_rot: %u\n", __func__, params->n_rot);
 }

 static void init_model(struct my_llama_model * model) {
@@ -350,25 +350,25 @@ static void init_model(struct my_llama_model * model) {
     model->train_tokens = 0;

     model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
+    printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);

     model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-    printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
+    printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);

     model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);

     // printing the per-layer allocations here so we dont print in the for loop.
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);

     ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
     ggml_set_name(model->norm, "norm.weight");
@@ -7,6 +7,51 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+static std::vector<std::string> split_lines(const std::string & s) {
+    std::string line;
+    std::vector<std::string> lines;
+    std::stringstream ss(s);
+    while (std::getline(ss, line)) {
+        lines.push_back(line);
+    }
+    return lines;
+}
+
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
+    for (size_t i = 0; i < tokens.size(); i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, false);
+    }
+}
+
+static void normalize(float * vec, float * out, int n) {
+    float norm = 0;
+    for (int i = 0; i < n; i++) {
+        norm += vec[i] * vec[i];
+    }
+    norm = sqrt(norm);
+    for (int i = 0; i < n; i++) {
+        out[i] = vec[i] / norm;
+    }
+}
+
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+    // clear previous kv_cache values (irrelevant for embeddings)
+    llama_kv_cache_clear(ctx);
+
+    // run model
+    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    if (llama_decode(ctx, batch) < 0) {
+        fprintf(stderr, "%s : failed to decode\n", __func__);
+    }
+
+    // normalize on copy
+    for (int k = 0; k < n_seq; k++) {
+        float * emb = llama_get_embeddings_ith(ctx, k);
+        float * out = output + k * n_embd;
+        normalize(emb, out, n_embd);
+    }
+}
+
 int main(int argc, char ** argv) {
     gpt_params params;
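`normalize` above L2-normalizes each embedding as it is copied out, so cosine similarity between two results reduces to a dot product. The same math as a Python sketch:

```python
import numpy as np

def normalize(vec: np.ndarray) -> np.ndarray:
    return vec / np.sqrt((vec * vec).sum())  # divide by the L2 norm

a = normalize(np.random.rand(384))
b = normalize(np.random.rand(384))
cosine = float(a @ b)  # dot product of unit vectors equals cosine similarity
```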
@@ -29,7 +74,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

     llama_model * model;
     llama_context * ctx;
@@ -55,49 +101,84 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s\n", get_system_info(params).c_str());
     }

-    int n_past = 0;
-
-    // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    // split the prompt into lines
+    std::vector<std::string> prompts = split_lines(params.prompt);
+
+    // max batch size
+    const uint64_t n_batch = params.n_batch;
+    GGML_ASSERT(params.n_batch == params.n_ctx);
+
+    // tokenize the prompts and trim
+    std::vector<std::vector<int32_t>> inputs;
+    for (const auto & prompt : prompts) {
+        auto inp = ::llama_tokenize(ctx, prompt, true);
+        if (inp.size() > n_batch) {
+            inp.resize(n_batch);
+        }
+        inputs.push_back(inp);
+    }

+    // tokenization stats
     if (params.verbose_prompt) {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
-        }
-        fprintf(stderr, "\n\n");
+        for (int i = 0; i < (int) inputs.size(); i++) {
+            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+            for (int j = 0; j < (int) inputs[i].size(); j++) {
+                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+            }
+            fprintf(stderr, "\n");
+        }
     }

-    if (embd_inp.size() > (size_t)n_ctx) {
-        fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
-                __func__, embd_inp.size(), n_ctx);
-        return 1;
-    }
-
-    while (!embd_inp.empty()) {
-        int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
-        if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return 1;
-        }
-        n_past += n_tokens;
-        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
-    }
+    // initialize batch
+    const int n_prompts = prompts.size();
+    struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts);

+    // allocate output
     const int n_embd = llama_n_embd(model);
-    const auto * embeddings = llama_get_embeddings(ctx);
-
-    for (int i = 0; i < n_embd; i++) {
-        printf("%f ", embeddings[i]);
-    }
-    printf("\n");
+    std::vector<float> embeddings(n_prompts * n_embd, 0);
+    float * emb = embeddings.data();
+
+    // break into batches
+    int p = 0; // number of prompts processed already
+    int s = 0; // number of prompts in current batch
+    for (int k = 0; k < n_prompts; k++) {
+        // clamp to n_batch tokens
+        auto & inp = inputs[k];
+        const uint64_t n_toks = inp.size();
+
+        // encode if at capacity
+        if (batch.n_tokens + n_toks > n_batch) {
+            float * out = emb + p * n_embd;
+            batch_decode(ctx, batch, out, s, n_embd);
+            llama_batch_clear(batch);
+            p += s;
+            s = 0;
+        }
+
+        // add to batch
+        batch_add_seq(batch, inp, s);
+        s += 1;
+    }
+
+    // final batch
+    float * out = emb + p * n_embd;
+    batch_decode(ctx, batch, out, s, n_embd);
+
+    // print first 3 embeddings
+    for (int j = 0; j < std::min(3, n_prompts); j++) {
+        fprintf(stderr, "embedding %d: ", j);
+        for (int i = 0; i < n_embd; i++) {
+            fprintf(stderr, "%f ", emb[j * n_embd + i]);
+        }
+        fprintf(stderr, "\n\n");
+    }
+    fprintf(stderr, "\n");

+    // clean up
     llama_print_timings(ctx);
     llama_free(ctx);
     llama_free_model(model);
     llama_backend_free();

     return 0;
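The loop above packs prompts greedily: sequences are appended until the next one would push the batch past `n_batch` tokens, then the batch is decoded and the output pointer advances by the number of sequences flushed. The same scheduling logic as a compact Python sketch (conceptual, not the C++ API):

```python
def pack_batches(token_counts: list[int], n_batch: int) -> list[list[int]]:
    # group prompt indices so each group's total token count stays <= n_batch
    batches: list[list[int]] = [[]]
    used = 0
    for k, n_toks in enumerate(token_counts):
        if used + n_toks > n_batch and batches[-1]:
            batches.append([])
            used = 0
        batches[-1].append(k)
        used += n_toks
    return batches

assert pack_batches([100, 200, 300, 250], n_batch=512) == [[0, 1], [2], [3]]
```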
@@ -7,8 +7,6 @@
 #include <string>
 #include <thread>

-static const size_t tensor_alignment = 32;
-
 struct lora_info {
     std::string filename;
     float scale;
@@ -337,24 +335,14 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
     params.mem_buffer = NULL;
     params.no_alloc = true;
     struct ggml_context * ctx = NULL;
-    struct ggml_allocr * alloc = NULL;
+    struct ggml_gallocr * alloc = NULL;
     struct ggml_cgraph * gf = NULL;

     ctx = ggml_init(params);
-    alloc = ggml_allocr_new_measure(tensor_alignment);
+    alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type())
     gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
-    size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf);
-    ggml_allocr_free(alloc);
-    ggml_free(ctx);
-
-    static std::vector<uint8_t> data_compute;
-    data_compute.resize(alloc_size + tensor_alignment);
-
-    ctx = ggml_init(params);
-    alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment);
-    gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
-    ggml_allocr_alloc_graph(alloc, gf);
-    ggml_allocr_free(alloc);
+
+    ggml_gallocr_alloc_graph(alloc, gf);

     struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);

     static std::vector<uint8_t> data_work;
@@ -363,6 +351,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int

     ggml_graph_compute(gf, &cplan);

+    ggml_gallocr_free(alloc);
     ggml_free(ctx);

     return true;
 }
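`build_graph_lora` (defined outside this excerpt) builds the standard low-rank update: the delta added to a base weight is the product of the two small factor matrices, scaled. As a numpy sketch with illustrative shapes (the conventional scaling is alpha / rank, though this tool takes `scaling` as an input):

```python
import numpy as np

n_out, n_in, r = 64, 128, 4        # illustrative dimensions
W = np.random.randn(n_out, n_in)   # base weight
lora_a = np.random.randn(r, n_in)  # low-rank factors
lora_b = np.random.randn(n_out, r)
scaling = 1.0                      # often alpha / r

W_merged = W + scaling * (lora_b @ lora_a)  # the merged tensor an apply step produces
assert W_merged.shape == W.shape
```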
@@ -80,9 +80,9 @@ The LORA rank can be configured for each model tensor type separately with these
 --rank-wk N         LORA rank for wk tensor (default 4)
 --rank-wv N         LORA rank for wv tensor (default 4)
 --rank-wo N         LORA rank for wo tensor (default 4)
---rank-w1 N         LORA rank for w1 tensor (default 4)
---rank-w2 N         LORA rank for w2 tensor (default 4)
---rank-w3 N         LORA rank for w3 tensor (default 4)
+--rank-ffn_gate N   LORA rank for ffn_gate tensor (default 4)
+--rank-ffn_down N   LORA rank for ffn_down tensor (default 4)
+--rank-ffn_up N     LORA rank for ffn_up tensor (default 4)
 ```

 The LORA rank of 'norm' tensors should always be 1.
@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 #include "llama.h"
 #include "common.h"
 #include "train.h"
@@ -13,8 +14,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static const size_t tensor_alignment = 32;
-
 struct my_llama_hparams {
     uint32_t n_vocab = 32000;
     uint32_t n_ctx = 512;
@@ -61,9 +60,9 @@ struct my_llama_layer {
     struct ggml_tensor * ffn_norm;

     // ff
-    struct ggml_tensor * w1;
-    struct ggml_tensor * w2;
-    struct ggml_tensor * w3;
+    struct ggml_tensor * ffn_gate; // w1
+    struct ggml_tensor * ffn_down; // w2
+    struct ggml_tensor * ffn_up;   // w3
 };

 struct my_llama_model {
@@ -86,9 +85,9 @@ struct my_llama_lora_hparams {
     uint32_t n_rank_wv = 4;
     uint32_t n_rank_wo = 4;
     uint32_t n_rank_ffn_norm = 1;
-    uint32_t n_rank_w1 = 4;
-    uint32_t n_rank_w2 = 4;
-    uint32_t n_rank_w3 = 4;
+    uint32_t n_rank_ffn_gate = 4;
+    uint32_t n_rank_ffn_down = 4;
+    uint32_t n_rank_ffn_up = 4;
     uint32_t n_rank_tok_embeddings = 4;
     uint32_t n_rank_norm = 1;
     uint32_t n_rank_output = 4;
@@ -118,17 +117,17 @@ struct my_llama_lora_layer {
     struct ggml_tensor * ffn_norm_b;

     // ff
-    struct ggml_tensor * w1_a;
-    struct ggml_tensor * w1_b;
-    struct ggml_tensor * w2_a;
-    struct ggml_tensor * w2_b;
-    struct ggml_tensor * w3_a;
-    struct ggml_tensor * w3_b;
+    struct ggml_tensor * ffn_gate_a;
+    struct ggml_tensor * ffn_gate_b;
+    struct ggml_tensor * ffn_down_a;
+    struct ggml_tensor * ffn_down_b;
+    struct ggml_tensor * ffn_up_a;
+    struct ggml_tensor * ffn_up_b;
 };

 struct my_llama_lora {
     struct ggml_context * ctx = NULL;
-    std::vector<uint8_t> data;
+    ggml_backend_buffer_t data;

     my_llama_lora_hparams hparams;
@@ -209,9 +208,9 @@ static void print_lora_params(struct my_llama_lora_hparams * params) {
     printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv);
     printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo);
     printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm);
-    printf("%s: n_rank_w1 : %u\n", __func__, params->n_rank_w1);
-    printf("%s: n_rank_w2 : %u\n", __func__, params->n_rank_w2);
-    printf("%s: n_rank_w3 : %u\n", __func__, params->n_rank_w3);
+    printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate);
+    printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down);
+    printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up);
     printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings);
     printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm);
     printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output);
@@ -320,9 +319,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
         layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i));
         layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i));
         layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i));
-        layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
-        layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
-        layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
+        layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
+        layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
+        layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));

         assert_shape_1d(layer.attention_norm, hparams.n_embd);
         assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
@@ -330,9 +329,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
         assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa());
         assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
         assert_shape_1d(layer.ffn_norm, hparams.n_embd);
-        assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff);
-        assert_shape_2d(layer.w2, hparams.n_ff, hparams.n_embd);
-        assert_shape_2d(layer.w3, hparams.n_embd, hparams.n_ff);
+        assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff);
+        assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd);
+        assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff);
     }
 }
@@ -363,69 +362,12 @@ static void set_param_lora(struct my_llama_lora * lora) {
         ggml_set_param(ctx, layer.wo_b);
         ggml_set_param(ctx, layer.ffn_norm_a);
         ggml_set_param(ctx, layer.ffn_norm_b);
-        ggml_set_param(ctx, layer.w1_a);
-        ggml_set_param(ctx, layer.w1_b);
-        ggml_set_param(ctx, layer.w2_a);
-        ggml_set_param(ctx, layer.w2_b);
-        ggml_set_param(ctx, layer.w3_a);
-        ggml_set_param(ctx, layer.w3_b);
-    }
-}
-
-static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora) {
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_a);
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_b);
-    ggml_allocr_alloc(alloc, lora->norm_a);
-    ggml_allocr_alloc(alloc, lora->norm_b);
-    ggml_allocr_alloc(alloc, lora->output_a);
-    ggml_allocr_alloc(alloc, lora->output_b);
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        ggml_allocr_alloc(alloc, layer.attention_norm_a);
-        ggml_allocr_alloc(alloc, layer.attention_norm_b);
-        ggml_allocr_alloc(alloc, layer.wq_a);
-        ggml_allocr_alloc(alloc, layer.wq_b);
-        ggml_allocr_alloc(alloc, layer.wk_a);
-        ggml_allocr_alloc(alloc, layer.wk_b);
-        ggml_allocr_alloc(alloc, layer.wv_a);
-        ggml_allocr_alloc(alloc, layer.wv_b);
-        ggml_allocr_alloc(alloc, layer.wo_a);
-        ggml_allocr_alloc(alloc, layer.wo_b);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_a);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_b);
-        ggml_allocr_alloc(alloc, layer.w1_a);
-        ggml_allocr_alloc(alloc, layer.w1_b);
-        ggml_allocr_alloc(alloc, layer.w2_a);
-        ggml_allocr_alloc(alloc, layer.w2_b);
-        ggml_allocr_alloc(alloc, layer.w3_a);
-        ggml_allocr_alloc(alloc, layer.w3_b);
-    }
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad);
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad);
-    ggml_allocr_alloc(alloc, lora->norm_a->grad);
-    ggml_allocr_alloc(alloc, lora->norm_b->grad);
-    ggml_allocr_alloc(alloc, lora->output_a->grad);
-    ggml_allocr_alloc(alloc, lora->output_b->grad);
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        ggml_allocr_alloc(alloc, layer.attention_norm_a->grad);
-        ggml_allocr_alloc(alloc, layer.attention_norm_b->grad);
-        ggml_allocr_alloc(alloc, layer.wq_a->grad);
-        ggml_allocr_alloc(alloc, layer.wq_b->grad);
-        ggml_allocr_alloc(alloc, layer.wk_a->grad);
-        ggml_allocr_alloc(alloc, layer.wk_b->grad);
-        ggml_allocr_alloc(alloc, layer.wv_a->grad);
-        ggml_allocr_alloc(alloc, layer.wv_b->grad);
-        ggml_allocr_alloc(alloc, layer.wo_a->grad);
-        ggml_allocr_alloc(alloc, layer.wo_b->grad);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad);
-        ggml_allocr_alloc(alloc, layer.w1_a->grad);
-        ggml_allocr_alloc(alloc, layer.w1_b->grad);
-        ggml_allocr_alloc(alloc, layer.w2_a->grad);
-        ggml_allocr_alloc(alloc, layer.w2_b->grad);
-        ggml_allocr_alloc(alloc, layer.w3_a->grad);
-        ggml_allocr_alloc(alloc, layer.w3_b->grad);
+        ggml_set_param(ctx, layer.ffn_gate_a);
+        ggml_set_param(ctx, layer.ffn_gate_b);
+        ggml_set_param(ctx, layer.ffn_down_a);
+        ggml_set_param(ctx, layer.ffn_down_b);
+        ggml_set_param(ctx, layer.ffn_up_a);
+        ggml_set_param(ctx, layer.ffn_up_b);
     }
 }
@@ -493,12 +435,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
         layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd);
         layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1);
-        layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd);
-        layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff);
-        layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff);
-        layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd);
-        layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd);
-        layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff);
+        layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd);
+        layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff);
+        layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff);
+        layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd);
+        layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd);
+        layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff);

         ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i));
         ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i));
@@ -512,28 +454,18 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
         ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i));
         ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i));
         ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i));
-        ggml_set_name(layer.w1_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i));
-        ggml_set_name(layer.w1_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i));
-        ggml_set_name(layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i));
-        ggml_set_name(layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i));
-        ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i));
-        ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i));
+        ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i));
+        ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i));
+        ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i));
+        ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i));
+        ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i));
+        ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i));
     }

     set_param_lora(lora);

-    // measure data size
-    size_t size = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
-    }
-
-    // allocate data
-    struct ggml_allocr * alloc = NULL;
-    lora->data.resize(size + tensor_alignment);
-    alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
-    alloc_lora(alloc, lora);
-    ggml_allocr_free(alloc);
+    // allocate data for lora tensors
+    lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
 }

 static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) {
@@ -565,12 +497,12 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
         randomize_tensor_normal(layer.ffn_norm_a, rnd);
         ggml_set_zero(layer.ffn_norm_b);
-        randomize_tensor_normal(layer.w1_a, rnd);
-        ggml_set_zero(layer.w1_b);
-        randomize_tensor_normal(layer.w2_a, rnd);
-        ggml_set_zero(layer.w2_b);
-        randomize_tensor_normal(layer.w3_a, rnd);
-        ggml_set_zero(layer.w3_b);
+        randomize_tensor_normal(layer.ffn_gate_a, rnd);
+        ggml_set_zero(layer.ffn_gate_b);
+        randomize_tensor_normal(layer.ffn_down_a, rnd);
+        ggml_set_zero(layer.ffn_down_b);
+        randomize_tensor_normal(layer.ffn_up_a, rnd);
+        ggml_set_zero(layer.ffn_up_b);
     }
     free_random_normal_distribution(rnd);
@@ -579,7 +511,7 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
 static struct ggml_tensor * llama_build_lora_finetune_graphs(
         struct my_llama_model * model,
         struct my_llama_lora * lora,
-        struct ggml_allocr * alloc,
+        ggml_gallocr_t alloc,
         struct ggml_context * ctx,
         struct ggml_cgraph * gf,
         struct ggml_cgraph * gb,
@@ -590,7 +522,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         const int n_tokens,
         const int n_batch,
        const bool enable_flash_attn,
-        const bool enable_checkpointing) {
+        const bool enable_checkpointing,
+        const bool measure_only) {

     ggml_set_scratch(ctx, { 0, 0, nullptr, });
     const int n_past = 0;
@@ -622,13 +555,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     // KQ_pos - contains the positions
     struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(alloc, KQ_pos);
-    if (!ggml_allocr_is_measure(alloc)) {
-        int * data = (int *) KQ_pos->data;
-        for (int i = 0; i < N; ++i) {
-            data[i] = n_past + i;
-        }
-    }
+    ggml_set_input(KQ_pos);

     // rope has so much parameters that we make a custom function for it
     auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
@@ -683,13 +610,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b));
         struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b));
         struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
         struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
         struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
         struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
-        struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b));
-        struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b));
-        struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b));
+        struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b));
+        struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b));
+        struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b));

         struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch);
         struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch);
@@ -732,11 +659,11 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
         struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
         struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
-        struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
-        struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
+        struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
+        struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
         struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
         struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
-        struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
+        struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
         struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
         cur = t30;
         if (enable_checkpointing) {
@@ -780,7 +707,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         // input gradient
         ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
         GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
-        ggml_allocr_alloc(alloc, t36->grad);
+        ggml_set_input(t36->grad);
         // KQ_pos
         ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
@ -796,20 +723,32 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f));
} }
// allocating checkpoints in one block to reduce memory fragmentation // allocating checkpoints in one block to reduce memory fragmentation
// note: they will be freed in reverse order // note: they will be freed in reverse order
for (unsigned int i = 0; i < checkpoints.size(); ++i) { for (unsigned int i = 0; i < checkpoints.size(); ++i) {
if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
ggml_allocr_alloc(alloc, checkpoints[i]); ggml_set_input(checkpoints[i]);
} }
} }
ggml_allocr_alloc_graph(alloc, gb); if (measure_only) {
ggml_gallocr_reserve(alloc, gb);
} else {
ggml_gallocr_alloc_graph(alloc, gb);
// set KQ_pos
{
int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
}
}
// remove the additional nodes and leafs // remove the additional nodes and leafs
for (int i = n_leafs_before; i < gb->n_leafs; ++i) { for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
@ -859,9 +798,9 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V); GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT); GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
init_lora(model, lora); init_lora(model, lora);
@ -886,12 +825,12 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b));
copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a)); copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a));
copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b)); copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b));
copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a)); copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a));
copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b)); copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b));
copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a)); copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a));
copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b)); copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b));
} }
} }
@ -929,9 +868,9 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_w1); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_w2); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_w3); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up);
gguf_add_tensor(fctx, lora->tok_embeddings_a); gguf_add_tensor(fctx, lora->tok_embeddings_a);
gguf_add_tensor(fctx, lora->tok_embeddings_b); gguf_add_tensor(fctx, lora->tok_embeddings_b);
@ -955,12 +894,12 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
gguf_add_tensor(fctx, layer.wo_b); gguf_add_tensor(fctx, layer.wo_b);
gguf_add_tensor(fctx, layer.ffn_norm_a); gguf_add_tensor(fctx, layer.ffn_norm_a);
gguf_add_tensor(fctx, layer.ffn_norm_b); gguf_add_tensor(fctx, layer.ffn_norm_b);
gguf_add_tensor(fctx, layer.w1_a); gguf_add_tensor(fctx, layer.ffn_gate_a);
gguf_add_tensor(fctx, layer.w1_b); gguf_add_tensor(fctx, layer.ffn_gate_b);
gguf_add_tensor(fctx, layer.w2_a); gguf_add_tensor(fctx, layer.ffn_down_a);
gguf_add_tensor(fctx, layer.w2_b); gguf_add_tensor(fctx, layer.ffn_down_b);
gguf_add_tensor(fctx, layer.w3_a); gguf_add_tensor(fctx, layer.ffn_up_a);
gguf_add_tensor(fctx, layer.w3_b); gguf_add_tensor(fctx, layer.ffn_up_b);
} }
} }
@ -1165,12 +1104,12 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB")); write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB"));
write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA")); write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA"));
write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB")); write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB"));
write_tensor(&file, layer.w1_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA"));
write_tensor(&file, layer.w1_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB"));
write_tensor(&file, layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA"));
write_tensor(&file, layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB"));
write_tensor(&file, layer.w3_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA"));
write_tensor(&file, layer.w3_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB"));
} }
} }
@ -1200,9 +1139,9 @@ struct train_params {
uint32_t n_rank_wv; uint32_t n_rank_wv;
uint32_t n_rank_wo; uint32_t n_rank_wo;
uint32_t n_rank_ffn_norm; uint32_t n_rank_ffn_norm;
uint32_t n_rank_w1; uint32_t n_rank_ffn_gate;
uint32_t n_rank_w2; uint32_t n_rank_ffn_down;
uint32_t n_rank_w3; uint32_t n_rank_ffn_up;
uint32_t n_rank_tok_embeddings; uint32_t n_rank_tok_embeddings;
uint32_t n_rank_norm; uint32_t n_rank_norm;
uint32_t n_rank_output; uint32_t n_rank_output;
@ -1213,9 +1152,9 @@ struct train_params {
bool custom_n_rank_wv; bool custom_n_rank_wv;
bool custom_n_rank_wo; bool custom_n_rank_wo;
bool custom_n_rank_ffn_norm; bool custom_n_rank_ffn_norm;
bool custom_n_rank_w1; bool custom_n_rank_ffn_gate;
bool custom_n_rank_w2; bool custom_n_rank_ffn_down;
bool custom_n_rank_w3; bool custom_n_rank_ffn_up;
bool custom_n_rank_tok_embeddings; bool custom_n_rank_tok_embeddings;
bool custom_n_rank_norm; bool custom_n_rank_norm;
bool custom_n_rank_output; bool custom_n_rank_output;
@ -1247,9 +1186,9 @@ static struct train_params get_default_train_params() {
params.n_rank_wv = 4; params.n_rank_wv = 4;
params.n_rank_wo = 4; params.n_rank_wo = 4;
params.n_rank_ffn_norm = 1; params.n_rank_ffn_norm = 1;
params.n_rank_w1 = 4; params.n_rank_ffn_gate = 4;
params.n_rank_w2 = 4; params.n_rank_ffn_down = 4;
params.n_rank_w3 = 4; params.n_rank_ffn_up = 4;
params.n_rank_tok_embeddings = 4; params.n_rank_tok_embeddings = 4;
params.n_rank_norm = 1; params.n_rank_norm = 1;
params.n_rank_output = 4; params.n_rank_output = 4;
@ -1260,9 +1199,9 @@ static struct train_params get_default_train_params() {
params.custom_n_rank_wv = false; params.custom_n_rank_wv = false;
params.custom_n_rank_wo = false; params.custom_n_rank_wo = false;
params.custom_n_rank_ffn_norm = false; params.custom_n_rank_ffn_norm = false;
params.custom_n_rank_w1 = false; params.custom_n_rank_ffn_gate = false;
params.custom_n_rank_w2 = false; params.custom_n_rank_ffn_down = false;
params.custom_n_rank_w3 = false; params.custom_n_rank_ffn_up = false;
params.custom_n_rank_tok_embeddings = false; params.custom_n_rank_tok_embeddings = false;
params.custom_n_rank_norm = false; params.custom_n_rank_norm = false;
params.custom_n_rank_output = false; params.custom_n_rank_output = false;
@ -1293,9 +1232,9 @@ static void train_print_usage(int argc, char ** argv, const struct train_params
fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n");
fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n");
fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n");
fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n");
fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n");
fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n");
print_common_train_usage(argc, argv, &params->common); print_common_train_usage(argc, argv, &params->common);
} }
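A usage sketch under the renamed options (the invocation shape is assumed from the finetune example's usual conventions; only the `--rank-ffn_*` flag names come from this change):

```sh
# hypothetical invocation; only the --rank-ffn_* flag names are from this diff
./finetune \
    --model-base open-llama-3b-v2-q8_0.gguf \
    --train-data shakespeare.txt \
    --lora-r 4 \
    --rank-ffn_gate 8 --rank-ffn_down 8 --rank-ffn_up 8
```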
@ -1430,27 +1369,27 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
} }
params->n_rank_wo = std::stoi(argv[i]); params->n_rank_wo = std::stoi(argv[i]);
params->custom_n_rank_wo = true; params->custom_n_rank_wo = true;
} else if (arg == "--rank-w1") { } else if (arg == "--rank-ffn_gate") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
params->n_rank_w1 = std::stoi(argv[i]); params->n_rank_ffn_gate = std::stoi(argv[i]);
params->custom_n_rank_w1 = true; params->custom_n_rank_ffn_gate = true;
} else if (arg == "--rank-w2") { } else if (arg == "--rank-ffn_down") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
params->n_rank_w2 = std::stoi(argv[i]); params->n_rank_ffn_down = std::stoi(argv[i]);
params->custom_n_rank_w2 = true; params->custom_n_rank_ffn_down = true;
} else if (arg == "--rank-w3") { } else if (arg == "--rank-ffn_up") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
params->n_rank_w3 = std::stoi(argv[i]); params->n_rank_ffn_up = std::stoi(argv[i]);
params->custom_n_rank_w3 = true; params->custom_n_rank_ffn_up = true;
} else { } else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
train_print_usage(argc, argv, &default_params); train_print_usage(argc, argv, &default_params);
@ -1513,12 +1452,12 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) {
nx += ggml_nelements(layer.wo_b); nx += ggml_nelements(layer.wo_b);
nx += ggml_nelements(layer.ffn_norm_a); nx += ggml_nelements(layer.ffn_norm_a);
nx += ggml_nelements(layer.ffn_norm_b); nx += ggml_nelements(layer.ffn_norm_b);
nx += ggml_nelements(layer.w1_a); nx += ggml_nelements(layer.ffn_gate_a);
nx += ggml_nelements(layer.w1_b); nx += ggml_nelements(layer.ffn_gate_b);
nx += ggml_nelements(layer.w2_a); nx += ggml_nelements(layer.ffn_down_a);
nx += ggml_nelements(layer.w2_b); nx += ggml_nelements(layer.ffn_down_b);
nx += ggml_nelements(layer.w3_a); nx += ggml_nelements(layer.ffn_up_a);
nx += ggml_nelements(layer.w3_b); nx += ggml_nelements(layer.ffn_up_b);
} }
return nx; return nx;
} }
@ -1572,9 +1511,9 @@ int main(int argc, char ** argv) {
uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r;
uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r;
uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1;
uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r; uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r;
uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r; uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r;
uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r; uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r;
uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r;
uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1;
uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r;
@ -1584,15 +1523,15 @@ int main(int argc, char ** argv) {
lora.hparams.n_rank_wv = n_rank_wv; lora.hparams.n_rank_wv = n_rank_wv;
lora.hparams.n_rank_wo = n_rank_wo; lora.hparams.n_rank_wo = n_rank_wo;
lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm; lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm;
lora.hparams.n_rank_w1 = n_rank_w1; lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate;
lora.hparams.n_rank_w2 = n_rank_w2; lora.hparams.n_rank_ffn_down = n_rank_ffn_down;
lora.hparams.n_rank_w3 = n_rank_w3; lora.hparams.n_rank_ffn_up = n_rank_ffn_up;
lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings; lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings;
lora.hparams.n_rank_norm = n_rank_norm; lora.hparams.n_rank_norm = n_rank_norm;
lora.hparams.n_rank_output = n_rank_output; lora.hparams.n_rank_output = n_rank_output;
// set opt params from command line // set opt params from command line
opt->params = ggml_opt_default_params(GGML_OPT_ADAM); opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
opt->params.print_forward_graph = false; opt->params.print_forward_graph = false;
opt->params.print_backward_graph = false; opt->params.print_backward_graph = false;
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
@ -1627,9 +1566,9 @@ int main(int argc, char ** argv) {
|| (lora.hparams.n_rank_wv != n_rank_wv) || (lora.hparams.n_rank_wv != n_rank_wv)
|| (lora.hparams.n_rank_wo != n_rank_wo) || (lora.hparams.n_rank_wo != n_rank_wo)
|| (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm) || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm)
|| (lora.hparams.n_rank_w1 != n_rank_w1) || (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate)
|| (lora.hparams.n_rank_w2 != n_rank_w2) || (lora.hparams.n_rank_ffn_down != n_rank_ffn_down)
|| (lora.hparams.n_rank_w3 != n_rank_w3) || (lora.hparams.n_rank_ffn_up != n_rank_ffn_up)
|| (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings) || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings)
|| (lora.hparams.n_rank_norm != n_rank_norm) || (lora.hparams.n_rank_norm != n_rank_norm)
|| (lora.hparams.n_rank_output != n_rank_output) || (lora.hparams.n_rank_output != n_rank_output)
@ -1663,7 +1602,7 @@ int main(int argc, char ** argv) {
printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f)); printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f));
if (params.only_write_lora) { if (params.only_write_lora) {
save_train_files_data save_data; save_train_files_data save_data;
@ -1690,10 +1629,6 @@ int main(int argc, char ** argv) {
int n_vocab = model.hparams.n_vocab; int n_vocab = model.hparams.n_vocab;
int n_batch = params.common.n_batch; int n_batch = params.common.n_batch;
std::vector<uint8_t> mem_input_data;
std::vector<uint8_t> mem_compute_data;
// context for input tensors without their data // context for input tensors without their data
struct ggml_init_params ctx_input_params = { struct ggml_init_params ctx_input_params = {
ggml_tensor_overhead() * 2, // mem_size ggml_tensor_overhead() * 2, // mem_size
@ -1706,17 +1641,11 @@ int main(int argc, char ** argv) {
struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
// measure required memory for input tensors
size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
tensor_alignment;
printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
// allocate input tensors // allocate input tensors
mem_input_data.resize(max_input_size); // measure required memory for input tensors
ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
ggml_allocr_alloc(alloc_inps, tokens_input); size_t max_input_size = ggml_backend_buffer_get_size(input_data);
ggml_allocr_alloc(alloc_inps, target_probs); printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
// context for compute tensors without their data // context for compute tensors without their data
const size_t estimated_compute_size_wo_data = ( const size_t estimated_compute_size_wo_data = (
@ -1743,7 +1672,7 @@ int main(int argc, char ** argv) {
// find best evaluation order // find best evaluation order
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
ctx_compute = ggml_init(ctx_compute_params); ctx_compute = ggml_init(ctx_compute_params);
ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment); ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = (enum ggml_cgraph_eval_order) order; gf->order = (enum ggml_cgraph_eval_order) order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1756,14 +1685,15 @@ int main(int argc, char ** argv) {
&logits, tokens_input, target_probs, &logits, tokens_input, target_probs,
n_tokens, n_batch, n_tokens, n_batch,
params.common.use_flash, params.common.use_flash,
params.common.use_checkpointing params.common.use_checkpointing,
true
); );
size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
if (max_compute_size < best_compute_size) { if (max_compute_size < best_compute_size) {
best_compute_size = max_compute_size; best_compute_size = max_compute_size;
best_order = gf->order; best_order = gf->order;
} }
ggml_allocr_free(alloc); ggml_gallocr_free(alloc);
ggml_free(ctx_compute); ggml_free(ctx_compute);
} }
size_t max_compute_size = best_compute_size; size_t max_compute_size = best_compute_size;
@ -1774,9 +1704,8 @@ int main(int argc, char ** argv) {
"invalid"); "invalid");
// allocate compute tensors // allocate compute tensors
mem_compute_data.resize(max_compute_size);
ctx_compute = ggml_init(ctx_compute_params); ctx_compute = ggml_init(ctx_compute_params);
ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = best_order; gf->order = best_order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1789,11 +1718,9 @@ int main(int argc, char ** argv) {
&logits, tokens_input, target_probs, &logits, tokens_input, target_probs,
n_tokens, n_batch, n_tokens, n_batch,
params.common.use_flash, params.common.use_flash,
params.common.use_checkpointing params.common.use_checkpointing,
false
); );
ggml_allocr_free(alloc);
ggml_allocr_free(alloc_inps);
// tokenize data // tokenize data
std::vector<llama_token> train_tokens; std::vector<llama_token> train_tokens;
@ -1908,6 +1835,8 @@ int main(int argc, char ** argv) {
ggml_free(ctx_work); ggml_free(ctx_work);
ggml_free(ctx_compute); ggml_free(ctx_compute);
ggml_free(ctx_input); ggml_free(ctx_input);
ggml_gallocr_free(alloc);
int64_t t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
printf("%s: total training time: ", __func__); printf("%s: total training time: ", __func__);

View file

@ -568,7 +568,8 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng); params.prompt = gpt_random_prompt(rng);
} }
llama_backend_init(params.numa); llama_backend_init();
llama_numa_init(params.numa);
llama_model_params mparams = llama_model_params_from_gpt_params(params); llama_model_params mparams = llama_model_params_from_gpt_params(params);
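The same two-line change repeats across the examples in this commit: the NUMA flag was split out of backend initialization into a dedicated call, so call sites now read (with `params.numa` carried over unchanged from the surrounding code):

```cpp
// before: llama_backend_init(params.numa);
llama_backend_init();          // backend init no longer takes a NUMA flag
llama_numa_init(params.numa);  // NUMA strategy is now configured separately
```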

View file

@ -202,7 +202,8 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
LOG("%s: llama backend init\n", __func__); LOG("%s: llama backend init\n", __func__);
llama_backend_init(params.numa); llama_backend_init();
llama_numa_init(params.numa);
llama_model * model; llama_model * model;
llama_context * ctx; llama_context * ctx;

View file

@ -87,7 +87,21 @@ class SchemaConverter:
elif schema_type == 'array' and 'items' in schema: elif schema_type == 'array' and 'items' in schema:
# TODO `prefixItems` keyword # TODO `prefixItems` keyword
item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item') item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space' list_item_operator = f'("," space {item_rule_name})'
successive_items = ""
min_items = schema.get("minItems", 0)
if min_items > 0:
first_item = f"({item_rule_name})"
successive_items = list_item_operator * (min_items - 1)
min_items -= 1
else:
first_item = f"({item_rule_name})?"
max_items = schema.get("maxItems")
if max_items is not None and max_items > min_items:
successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
else:
successive_items += list_item_operator + "*"
rule = f'"[" space {first_item} {successive_items} "]" space'
return self._add_rule(rule_name, rule) return self._add_rule(rule_name, rule)
else: else:

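Traced through, the new bounds handling works as follows: for an array schema with `minItems: 2` and `maxItems: 4`, the first item is emitted as mandatory, one comma-prefixed item covers the remainder of the minimum, and two optional comma-prefixed items cover the slack up to the maximum, yielding a rule of the shape `"[" space (item) ("," space item) ("," space item)? ("," space item)? "]" space` (where `item` stands in for the generated item rule name).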
View file

@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) {
static const char * split_mode_str(llama_split_mode mode) { static const char * split_mode_str(llama_split_mode mode) {
switch (mode) { switch (mode) {
case LLAMA_SPLIT_NONE: return "none"; case LLAMA_SPLIT_MODE_NONE: return "none";
case LLAMA_SPLIT_LAYER: return "layer"; case LLAMA_SPLIT_MODE_LAYER: return "layer";
case LLAMA_SPLIT_ROW: return "row"; case LLAMA_SPLIT_MODE_ROW: return "row";
default: GGML_ASSERT(!"invalid split mode"); default: GGML_ASSERT(!"invalid split mode");
} }
} }
@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = {
/* type_v */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16},
/* n_threads */ {get_num_physical_cores()}, /* n_threads */ {get_num_physical_cores()},
/* n_gpu_layers */ {99}, /* n_gpu_layers */ {99},
/* split_mode */ {LLAMA_SPLIT_LAYER}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0}, /* main_gpu */ {0},
/* no_kv_offload */ {false}, /* no_kv_offload */ {false},
/* mul_mat_q */ {true}, /* mul_mat_q */ {true},
@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
for (const auto & m : p) { for (const auto & m : p) {
llama_split_mode mode; llama_split_mode mode;
if (m == "none") { if (m == "none") {
mode = LLAMA_SPLIT_NONE; mode = LLAMA_SPLIT_MODE_NONE;
} else if (m == "layer") { } else if (m == "layer") {
mode = LLAMA_SPLIT_LAYER; mode = LLAMA_SPLIT_MODE_LAYER;
} else if (m == "row") { } else if (m == "row") {
mode = LLAMA_SPLIT_ROW; mode = LLAMA_SPLIT_MODE_ROW;
} else { } else {
invalid_param = true; invalid_param = true;
break; break;
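For reference, these modes surface on the bench tool's command line; a hedged invocation sketch (the `-sm` flag name is assumed from the tool's usage text, not shown in this hunk):

```sh
./llama-bench -m model.gguf -sm none   # LLAMA_SPLIT_MODE_NONE: single GPU
./llama-bench -m model.gguf -sm layer  # LLAMA_SPLIT_MODE_LAYER: split layers across GPUs
./llama-bench -m model.gguf -sm row    # LLAMA_SPLIT_MODE_ROW: split tensor rows across GPUs
```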
@ -1151,8 +1151,7 @@ int main(int argc, char ** argv) {
if (!params.verbose) { if (!params.verbose) {
llama_log_set(llama_null_log_callback, NULL); llama_log_set(llama_null_log_callback, NULL);
} }
bool numa = false; llama_backend_init();
llama_backend_init(numa);
// initialize printer // initialize printer
std::unique_ptr<printer> p; std::unique_ptr<printer> p;

View file

@ -274,8 +274,8 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
extern "C" extern "C"
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) { Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
llama_backend_init(numa); llama_backend_init();
} }
extern "C" extern "C"

View file

@ -51,7 +51,7 @@ actor LlamaContext {
} }
static func create_context(path: String) throws -> LlamaContext { static func create_context(path: String) throws -> LlamaContext {
llama_backend_init(false) llama_backend_init()
var model_params = llama_model_default_params() var model_params = llama_model_default_params()
#if targetEnvironment(simulator) #if targetEnvironment(simulator)

View file

@ -1,10 +1,12 @@
# LLaVA # LLaVA
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants. Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants,
as well as [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants.
The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
models are available. models are available.
For llava-1.6, a variety of prepared gguf models are available as well: [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf)
After API is confirmed, more models will be supported / uploaded. After API is confirmed, more models will be supported / uploaded.
@ -18,10 +20,11 @@ After building, run: `./llava-cli` to see the usage. For example:
``` ```
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
**note**: For GPU offloading, make sure to use the `-ngl` flag just like usual
## Model conversion ## LLaVA 1.5
- Clone `llava-v15-7b` and `clip-vit-large-patch14-336` locally: - Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
```sh ```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
@ -50,13 +53,79 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa
5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF: 5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
```sh ```sh
python ./convert.py ../llava-v1.5-7b python ./convert.py ../llava-v1.5-7b --skip-unknown
``` ```
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory. Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
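You can then run llava-cli against the converted pair; a typical invocation might look like this (the f16 file names are assumed from the converters' default outputs):

```console
./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf \
    --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf \
    --image some-image.jpg --temp 0.1
```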
## LLaVA 1.6 gguf conversion
1) First clone a LLaVA 1.6 model:
```console
git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
```
2) Use `llava-surgery-v2.py`, which supports llava-1.5 variants as well, for both pytorch and safetensor models:
```console
python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
```
- you will find a llava.projector and a llava.clip file in your model directory
3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
```console
mkdir vit
cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
cp ../llava-v1.6-vicuna-7b/llava.projector vit/
curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
```
4) Create the visual gguf model:
```console
python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
```
- This is similar to llava-1.5; the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
5) Then convert the model to gguf format:
```console
python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```
6) And finally we can run llava-cli using the 1.6 model version:
```console
./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
```
**note** llava-1.6 needs more context than llava-1.5; at least 3000 tokens are needed (just run it at -c 4096)
**note** llava-1.6 greatly benefits from batched prompt processing (defaults work)
## llava-cli templating and llava-1.6 prompting
llava-1.5 models all use the same vicuna prompt; here you can just add your image question, e.g. `-p "Provide a full description."`
For llava-1.5 models which are not vicuna (mistral and Yi) you need to adapt the system prompt as well as the user prompt; for this purpose llava-cli has a basic templating system:
**For Mistral and using llava-cli binary:**
Add this: `-p "<image>\nUSER:\nProvide a full description.\nASSISTANT:\n"`
The mistral template for llava-1.6 appears to use no system prompt and a USER/ASSISTANT role.
**For the 34B this should work:**
Add this: `-e -p <|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nProvide a full description.<|im_end|><|im_start|>assistant\n`
## How to know if you are running in llava-1.5 or llava-1.6 mode
When running llava-cli you will see visual information right before the prompt is processed:
**Llava-1.5:**
`encode_image_with_clip: image embedding created: 576 tokens`
**Llava-1.6 (anything above 576):**
`encode_image_with_clip: image embedding created: 2880 tokens`
Alternatively, just pay attention to how many "tokens" have been used for your prompt; llava-1.6 will also show 1000+ tokens there.
## TODO ## TODO
- [ ] Support non-CPU backend for the image encoding part. - [x] Support non-CPU backend for the image encoding part.
- [ ] Support different sampling methods. - [ ] Support different sampling methods.
- [ ] Support more model variants. - [ ] Support more model variants.

File diff suppressed because it is too large

View file

@ -24,25 +24,7 @@ struct clip_ctx;
extern "C" { extern "C" {
#endif #endif
struct clip_vision_hparams { struct clip_ctx;
int32_t image_size;
int32_t patch_size;
int32_t hidden_size;
int32_t n_intermediate;
int32_t projection_dim;
int32_t n_head;
int32_t n_layer;
float eps;
};
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
struct clip_image_u8_batch { struct clip_image_u8_batch {
struct clip_image_u8 * data; struct clip_image_u8 * data;
@ -54,18 +36,43 @@ struct clip_image_f32_batch {
size_t size; size_t size;
}; };
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
// TODO: should be enum, not string
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init(); CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */ /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square); /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs);
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
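Taken together, the reworked header implies a load → preprocess → batch-encode flow. A hypothetical caller sketch under those declarations (includes and error handling elided; the buffer sizing is an assumption based on `clip_embd_nbytes` being per-image):

```cpp
// hypothetical caller for the API above; names are taken from the header
struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", /*verbosity=*/1);

struct clip_image_u8 * img = clip_image_u8_init();
clip_image_load_from_file("some-image.jpg", img);

// preprocessing can now emit several patches per image (llava-1.6 "anyres")
struct clip_image_f32_batch imgs = {};
clip_image_preprocess(ctx, img, imgs);

// one embedding buffer slot per preprocessed patch
std::vector<float> embd(imgs.size * clip_embd_nbytes(ctx) / sizeof(float));
clip_image_batch_encode(ctx, /*n_threads=*/4, &imgs, embd.data());

clip_image_f32_batch_free(imgs);
clip_image_u8_free(img);
clip_free(ctx);
```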

View file

@ -71,25 +71,26 @@ def bytes_to_unicode():
return dict(zip(bs, cs)) return dict(zip(bs, cs))
ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py") ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--text-only", action="store_true", required=False, ap.add_argument("--text-only", action="store_true", required=False,
help="Save a text-only model. It can't be used to encode images") help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False, ap.add_argument("--vision-only", action="store_true", required=False,
help="Save a vision-only model. It can't be used to encode texts") help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip_model_is_vision", action="store_true", required=False, ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
help="The clip model is from openclip (for ViT-SO400M type))")
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
default_image_mean = [0.48145466, 0.4578275, 0.40821073] default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711] default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
# with proper # with proper
args = ap.parse_args() args = ap.parse_args()
@ -105,7 +106,7 @@ if args.use_f32:
# output in the same directory as the model if output_dir is None # output in the same directory as the model if output_dir is None
dir_model = args.model_dir dir_model = args.model_dir
if args.clip_model_is_vision: if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
vocab = None vocab = None
tokens = None tokens = None
else: else:
@ -133,7 +134,7 @@ ftype = 1
if args.use_f32: if args.use_f32:
ftype = 0 ftype = 0
if args.clip_model_is_vision: if args.clip_model_is_vision or args.clip_model_is_openclip:
model = CLIPVisionModel.from_pretrained(dir_model) model = CLIPVisionModel.from_pretrained(dir_model)
processor = None processor = None
else: else:
@ -202,6 +203,57 @@ if has_vision_encoder:
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
# /**
# "image_grid_pinpoints": [
# [
# 336,
# 672
# ],
# [
# 672,
# 336
# ],
# [
# 672,
# 672
# ],
# [
# 1008,
# 336
# ],
# [
# 336,
# 1008
# ]
# ],
# Flattened:
# [
# 336, 672,
# 672, 336,
# 672, 672,
# 1008, 336,
# 336, 1008
# ]
# *
# */
if "image_grid_pinpoints" in v_hparams:
# flatten it
image_grid_pinpoints = []
for pinpoint in v_hparams["image_grid_pinpoints"]:
for p in pinpoint:
image_grid_pinpoints.append(p)
fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
if "image_crop_resolution" in v_hparams:
fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
if "image_aspect_ratio" in v_hparams:
fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"])
if "image_split_resolution" in v_hparams:
fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"])
if "mm_patch_merge_type" in v_hparams:
fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
if "mm_projector_type" in v_hparams:
fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
if processor is not None: if processor is not None:
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean

View file

@ -155,11 +155,29 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
system_prompt = prompt.substr(0, image_pos); system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length()); user_prompt = prompt.substr(image_pos + std::string("<image>").length());
printf("system_prompt: %s\n", system_prompt.c_str()); printf("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
printf("user_prompt: %s\n", user_prompt.c_str()); printf("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
} else { } else {
// llava-1.5 native mode // llava-1.5 native mode
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
user_prompt = prompt + "\nASSISTANT:"; user_prompt = prompt + "\nASSISTANT:";
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
} }
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos); eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
@ -171,13 +189,17 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
fprintf(stderr, "\n"); fprintf(stderr, "\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
std::string response = "";
for (int i = 0; i < max_tgt_len; i++) { for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break; if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp); printf("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
fflush(stdout); fflush(stdout);
} }
@ -196,7 +218,8 @@ static struct llava_context * llava_init(gpt_params * params) {
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_backend_init(params->numa); llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = llama_model_params_from_gpt_params(*params); llama_model_params model_params = llama_model_params_from_gpt_params(*params);

View file

@ -0,0 +1,155 @@
import argparse
import glob
import os
import torch
from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file
# Function to determine if file is a SafeTensor file
def is_safetensor_file(file_path):
return file_path.endswith('.safetensors')
# Unified loading function
def load_model(file_path):
if is_safetensor_file(file_path):
tensors = {}
with safe_open(file_path, framework="pt", device="cpu") as f:
for key in f.keys():
tensors[key] = f.get_tensor(key).clone()
# output shape
print(f"{key} : {tensors[key].shape}")
return tensors, 'safetensor'
else:
return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch'
# Unified saving function
def save_model(model, file_path, file_type):
if file_type == 'safetensor':
# safe_save(model, file_path)
save_file(model, file_path)
else:
torch.save(model, file_path)
# Adapted function to clean vision tower from checkpoint
def clean_vision_tower_from_checkpoint(checkpoint_path):
checkpoint, file_type = load_model(checkpoint_path)
# file_type = 'pytorch'
model_path = os.path.dirname(checkpoint_path)
print(f"Searching for vision tower tensors in {checkpoint_path}")
clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") or k.startswith("vit."))]
if len(clip_tensors) > 0:
print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
# Adapted for file type
clip_path = os.path.join(model_path, "llava.clip")
if os.path.exists(clip_path):
print(f"Loading existing llava.clip from {clip_path}")
existing_clip, _ = load_model(clip_path)
else:
print(f"Creating new llava.clip at {clip_path}")
existing_clip = {}
# Update existing_clip with new tensors, avoid duplicates
for name in clip_tensors:
simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name
print(f"Adding {simple_name} to llava.clip")
if simple_name not in existing_clip:
existing_clip[simple_name] = checkpoint[name]
# Save the updated clip tensors back to llava.clip
save_model(existing_clip, clip_path, 'pytorch')
# Remove the tensors from the original checkpoint
for name in clip_tensors:
del checkpoint[name]
return True
return False
def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector):
newline_checkpoint_path = None
projector_checkpoint_path = None
for path in checkpoint_paths:
checkpoint, _ = load_model(path)
if newline_criteria(checkpoint) and newline_checkpoint_path is None:
newline_checkpoint_path = path
if projector(checkpoint):
projector_checkpoint_path = path
return newline_checkpoint_path, projector_checkpoint_path
def newline_criteria(checkpoint):
return any(k.startswith("model.image_newline") for k in checkpoint.keys())
def proj_criteria(checkpoint):
return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys())
# Command-line interface setup
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model")
ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files")
args = ap.parse_args()
if args.clean_vision_tower:
# Generalized to handle both PyTorch and SafeTensors models
model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
# checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))]
checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
for projector_checkpoint_path in checkpoint_paths:
print(f"Cleaning {projector_checkpoint_path}")
if not clean_vision_tower_from_checkpoint(projector_checkpoint_path):
print(f"No vision tower found in {projector_checkpoint_path}")
# we break once none is found, so far all models append them at the end
# break
print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.")
# Now we look for the projector in the last checkpoint
model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
# last_checkpoint_path = checkpoint_paths[0]
# first_checkpoint_path = checkpoint_paths[-1]
newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria)
print(f"Taking projector from {projector_checkpoint_path}")
first_mm_tensors = []
first_checkpoint = None
if newline_checkpoint_path is not None:
print(f"Taking newline from {newline_checkpoint_path}")
first_checkpoint, file_type = load_model(newline_checkpoint_path)
first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")]
# Load the checkpoint
mm_tensors = []
last_checkpoint = None
if projector_checkpoint_path is not None:
last_checkpoint, file_type = load_model(projector_checkpoint_path)
mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")]
if len(mm_tensors) == 0:
if last_checkpoint is not None:
for k, v in last_checkpoint.items():
print(k)
print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.")
print("No tensors found. Is this a LLaVA model?")
exit()
print(f"Found {len(mm_tensors)} tensors to extract.")
print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
# projector = {name: checkpoint.[name].float() for name in mm_tensors}
projector = {}
for name in mm_tensors:
projector[name] = last_checkpoint[name].float()
for name in first_mm_tensors:
projector[name] = first_checkpoint[name].float()
if len(projector) > 0:
save_model(projector, f"{args.model}/llava.projector", 'pytorch')
print("Done!")
print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")

View file

@ -19,19 +19,12 @@ mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_project
projector = {name: checkpoint[name].float() for name in mm_tensors}
torch.save(projector, f"{args.model}/llava.projector")

-# remove these tensors from the checkpoint and save it again
-for name in mm_tensors:
-    del checkpoint[name]

# BakLLaVA models contain CLIP tensors in it
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
if len(clip_tensors) > 0:
    clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/llava.clip")

-    # remove these tensors
-    for name in clip_tensors:
-        del checkpoint[name]

    # added tokens should be removed to be able to convert Mistral models
    if os.path.exists(f"{args.model}/added_tokens.json"):
@ -39,7 +32,6 @@ if len(clip_tensors) > 0:
        f.write("{}\n")

-torch.save(checkpoint, path)

print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")

View file

@ -2,32 +2,296 @@
#include "common.h"
#include "llama.h"
#include "llava.h"
-#include "base64.hpp"

#include <cstdio>
#include <cstdlib>
#include <vector>
+#include <numeric>
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
std::vector<uint8_t> buf;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
std::vector<float> buf;
};
struct clip_image_grid_shape {
int first;
int second;
};
/**
* Selects the best resolution from a list of possible resolutions based on the original size.
*
* @param original_size The original size of the image in the format (width, height).
* @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
* @return The best fit resolution in the format (width, height).
*/
static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size, const std::vector<std::pair<int, int>>& possible_resolutions) {
int original_width = original_size.first;
int original_height = original_size.second;
std::pair<int, int> best_fit;
int max_effective_resolution = 0;
int min_wasted_resolution = std::numeric_limits<int>::max();
for (const auto& resolution : possible_resolutions) {
int width = resolution.first;
int height = resolution.second;
float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
int downscaled_width = static_cast<int>(original_width * scale);
int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution;
// fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution;
best_fit = resolution;
}
}
return best_fit;
}
/**
* @brief Get the anyres image grid shape object
*
* @param image_size
* @param grid_pinpoints
* @param image_patch_size
* @return <int, int>
*/
static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> & image_size, const std::vector<std::pair<int, int>> & grid_pinpoints, int image_patch_size) {
/**
Conversion from gguf flat array to vector:
std::vector<std::pair<int, int>> possible_resolutions;
for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
}
*/
auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
}
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
struct {
struct ggml_tensor * newline;
struct ggml_context * ctx;
} model;
const int32_t image_size = clip_image_size(ctx_clip);
const int32_t patch_size = clip_patch_size(ctx_clip);
int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
int num_patches_width = grid_shape.first; // grid 1-4
int num_patches_height = grid_shape.second; // grid 1-4
const size_t num_images = num_patches_width * num_patches_height + 1;
// TODO: the required size is not actually calculated here - it's only tens of MB
size_t ctx_size = 0;
{
ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
}
struct ggml_init_params params {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API
};
// Python reference code for full unpad:
/*
base_image_feature = image_feature[0]
image_feature = image_feature[1:]
image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
image_feature = image_feature.flatten(1, 2).flatten(2, 3)
image_feature = unpad_image(image_feature, image_sizes[image_idx])
image_feature = torch.cat((
image_feature,
self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
), dim=-1)
image_feature = image_feature.flatten(1, 2).transpose(0, 1)
image_feature = torch.cat((base_image_feature, image_feature), dim=0)
*/
// We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
// In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet.
// Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
// Once all images are processed, the base_image_features are prepended without any changes.
// Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
/*
image_feature = image_feature.view(2, 2, 24, 24, 4096)
image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
image_feature = image_feature.view(2, 24, 2, 24, 4096)
image_feature = image_feature.flatten(0, 3)
// Reshape to 4D tensor by merging the last two dimensions
image_feature = image_feature.view(2, 2, 24, 24*4096)
image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
image_feature = image_feature.view(-1, 4096)
*/
model.ctx = ggml_init(params);
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
if (newline_tmp->buffer == NULL) {
printf("newline_tmp tensor buffer is NULL\n");
}
ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
} else {
model.newline->data = newline_tmp->data;
if (model.newline->data == NULL) {
printf("newline_tmp tensor data is NULL\n");
}
}
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base
for (size_t i = 1; i < num_images; i++) {
size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
}
struct ggml_cgraph * gf = ggml_new_graph(model.ctx);
size_t size_ele = ggml_type_size(GGML_TYPE_F32);
struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features,
num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
num_patches_per_side,
num_patches_width,
num_patches_height,
size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
// ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
/**
At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings
image_feature = torch.cat((
image_feature,
self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
), dim=-1)
*
*/
// ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
ggml_build_forward_expand(gf, flatten);
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
// append without newline tokens (default behavior in llava_arch when not using unpad ):
memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
// Debug: Test single segments
// Current findings: sending base image, sending a segment embedding all works similar to python
// However, permuted embeddings do not work yet (stride issue?)
// memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
// memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
// *n_img_pos_out=576;
ggml_free(model.ctx);
return true;
}
#include "base64.hpp"
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
-    clip_image_f32 * img_res = clip_image_f32_init();
-    if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
+    // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
+    clip_image_f32_batch img_res_v;
+    img_res_v.size = 0;
+    img_res_v.data = nullptr;
+    if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
         fprintf(stderr, "%s: unable to preprocess image\n", __func__);
-        clip_image_f32_free(img_res);
+        delete[] img_res_v.data;
         return false;
     }

-    *n_img_pos = clip_n_patches(ctx_clip);
-
     const int64_t t_img_enc_start_us = ggml_time_us();

-    bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
-    clip_image_f32_free(img_res);
-    if (!encoded) {
-        fprintf(stderr, "Unable to encode image\n");
-        return false;
-    }
+    const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data;
if (!encoded) {
fprintf(stderr, "Unable to encode image\n");
return false;
}
} else {
// spatial_unpad llava-1.6 type embedding
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
for (size_t i = 0; i < img_res_v.size; i++) {
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) {
fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
}
const int64_t t_img_enc_batch_us = ggml_time_us();
printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip);
std::vector<std::pair<int, int>> grid_pinpoints;
for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
}
// free all img_res_v - not needed anymore
delete[] img_res_v.data;
img_res_v.size = 0;
img_res_v.data = nullptr;
const int32_t image_size = clip_image_size(ctx_clip);
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
int n_img_pos_out;
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
free(image_embd_v[i]);
}
image_embd_v.clear();
// debug image/segment/normalization content:
// clip_image_u8 * tmp = clip_image_u8_init();
// clip_image_convert_f32_to_u8(*image_feature, *tmp);
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
    }

    printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
@ -47,11 +311,10 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
    return true;
}

-static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
+bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
    if (!image_embd) {
        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
-        free(image_embd);
        return false;
    }
@ -85,7 +348,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
    return true;
}

-LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
+struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
    clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
        clip_image_u8_free(img);
@ -142,7 +405,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
    return true;
}

-LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
+struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
    unsigned char* image_bytes;
    long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
@ -151,13 +414,13 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct
        return NULL;
    }

-    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
+    llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
    free(image_bytes);
    return embed;
}

-LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) {
+void llava_image_embed_free(struct llava_image_embed * embed) {
    free(embed->embed);
    free(embed);
}

View file

@ -3,7 +3,6 @@
#include "ggml.h"

#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
@ -32,6 +31,8 @@ struct llava_image_embed {
/** sanity check for clip <-> llava embed size match */
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);

+LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);

/** build an image embed from image file bytes */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
/** build an image embed from a path to an image filename */
@ -42,7 +43,6 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);

#ifdef __cplusplus
}
#endif

View file

@ -54,7 +54,8 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS

    // init llama.cpp
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

    llama_model * model = NULL;
    llama_context * ctx = NULL;

View file

@ -1,7 +1,9 @@
#include "common.h"
+#include "ggml.h"
#include "llama.h"

#include <cmath>
+#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>
@ -29,7 +31,8 @@ int main(int argc, char ** argv){
#endif // LOG_DISABLE_LOGS

    // init llama.cpp
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

    llama_model * model = NULL;
    llama_context * ctx = NULL;
@ -73,6 +76,8 @@ int main(int argc, char ** argv){
    int n_drafted = 0;
    int n_accept  = 0;

+    int64_t t_draft_us = 0;

    int n_past = inp.size();
    bool has_eos = false;
@ -160,7 +165,7 @@ int main(int argc, char ** argv){
    // generate n_pred tokens through prompt lookup
    auto prompt_lookup = [&]() -> void {
-        int inp_size = inp.size();
+        const int inp_size = inp.size();
        for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
            const llama_token * ngram = &inp[inp_size - ngram_size];
@ -191,8 +196,12 @@ int main(int argc, char ** argv){
            return;
        };

+        const int64_t t_start_draft_us = ggml_time_us();
        prompt_lookup();
+        t_draft_us += ggml_time_us() - t_start_draft_us;

        llama_decode(ctx, batch_tgt);
        ++n_past;
@ -210,6 +219,8 @@ int main(int argc, char ** argv){
    LOG_TEE("n_draft   = %d\n", n_draft);
    LOG_TEE("n_predict = %d\n", n_predict);
    LOG_TEE("n_drafted = %d\n", n_drafted);
+    LOG_TEE("t_draft   = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
    LOG_TEE("n_accept  = %d\n", n_accept);
    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
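For intuition, the drafting step timed by `t_draft_us` above is just an n-gram lookup over the tokens already in the context. A minimal Python sketch of the same idea (simplified; the real example operates on llama.cpp token buffers and batches):

```python
def prompt_lookup_draft(inp: list[int], ngram_min: int, ngram_max: int, n_draft: int) -> list[int]:
    """Propose draft tokens by finding an earlier occurrence of the most
    recent n-gram and returning the tokens that followed it."""
    for ngram_size in range(ngram_max, ngram_min, -1):  # mirrors the C++ loop above
        if ngram_size > len(inp):
            continue
        ngram = inp[len(inp) - ngram_size:]
        # range end excludes the trailing occurrence, so the n-gram never matches itself
        for i in range(len(inp) - ngram_size):
            if inp[i:i + ngram_size] == ngram:
                draft = inp[i + ngram_size:i + ngram_size + n_draft]
                if draft:
                    return draft
    return []

print(prompt_lookup_draft([1, 2, 3, 9, 1, 2, 3], ngram_min=1, ngram_max=3, n_draft=2))  # -> [9, 1]
```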

View file

@ -283,7 +283,11 @@ These options help improve the performance and memory usage of the LLaMA models.
### NUMA support

-- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
+- `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
+- `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node.
+- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allows arbitrary core usage patterns, for example a map that uses all the cores on one NUMA node, and just enough cores on a second node to saturate the inter-node memory bus.
+
+These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
### Memory Float 32

View file

@ -98,7 +98,7 @@ static void write_logfile(
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
    if (signo == SIGINT) {
-        if (!is_interacting) {
+        if (!is_interacting && g_params->interactive) {
            is_interacting = true;
        } else {
            console::cleanup();
@ -185,7 +185,8 @@ int main(int argc, char ** argv) {
    }

    LOG("%s: llama backend init\n", __func__);
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

    llama_model * model;
    llama_context * ctx;
@ -333,6 +334,8 @@ int main(int argc, char ** argv) {
    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
        params.n_keep = (int)embd_inp.size();
+    } else {
+        params.n_keep += add_bos; // always keep the BOS token
    }
// prefix & suffix for instruct mode // prefix & suffix for instruct mode
@ -382,8 +385,8 @@ int main(int argc, char ** argv) {
        }
    }

-    if (params.n_keep > 0) {
+    if (params.n_keep > add_bos) {
        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
        for (int i = 0; i < params.n_keep; i++) {
            LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }
@ -392,7 +395,8 @@ int main(int argc, char ** argv) {
        LOG_TEE("\n");
    }

-    if (params.interactive) {
+    // ctrl+C handling
+    {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
        sigint_action.sa_handler = sigint_handler;
@ -405,7 +409,9 @@ int main(int argc, char ** argv) {
        };
        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
+    }
+
+    if (params.interactive) {
        LOG_TEE("%s: interactive mode on.\n", __func__);

        if (!params.antiprompt.empty()) {
@ -536,14 +542,14 @@ int main(int argc, char ** argv) {
                break;
            }

-            const int n_left    = n_past - params.n_keep - 1;
+            const int n_left    = n_past - params.n_keep;
            const int n_discard = n_left/2;

            LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                n_past, n_left, n_ctx, params.n_keep, n_discard);

-            llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
-            llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+            llama_kv_cache_seq_rm   (ctx, 0, params.n_keep            , params.n_keep + n_discard);
+            llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

            n_past -= n_discard;

View file

@ -122,7 +122,8 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS

    // init llama.cpp
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

    llama_model * model = NULL;
    llama_context * ctx = NULL;

View file

@ -71,7 +71,8 @@ int main(int argc, char ** argv) {
    // init LLM

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

    // initialize the model

View file

@ -309,7 +309,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
}

static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
-    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
    // Output: `perplexity: 13.5106 [114/114]`
    // BOS tokens will be added for each chunk before eval
@ -447,7 +447,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        return perplexity_v2(ctx, params);
    }

-    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
    // Output: `perplexity: 13.5106 [114/114]`
    // BOS tokens will be added for each chunk before eval
@ -1623,7 +1623,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    uint32_t n_ctx;
    in.read((char *)&n_ctx, sizeof(n_ctx));
    if (n_ctx > llama_n_ctx(ctx)) {
-        fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n",
+        fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
                __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
    }
@ -1809,7 +1809,8 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

    llama_model * model;
    llama_context * ctx;

View file

@ -23,14 +23,18 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
+   { "IQ1_S",  LLAMA_FTYPE_MOSTLY_IQ1_S,  " 1.56 bpw quantization", },
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
    { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
+   { "IQ3_S",  LLAMA_FTYPE_MOSTLY_IQ3_S,  " 3.44 bpw quantization", },
+   { "IQ3_M",  LLAMA_FTYPE_MOSTLY_IQ3_M,  " 3.66 bpw quantization mix", },
    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
    { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
+   { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.25 bpw non-linear quantization", },
    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
@ -237,7 +241,7 @@ int main(int argc, char ** argv) {
        params.imatrix = &imatrix_data;
    }

-    llama_backend_init(false);
+    llama_backend_init();

    // parse command line arguments
    const std::string fname_inp = argv[arg_idx];
@ -287,9 +291,10 @@ int main(int argc, char ** argv) {
        }
    }

-    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) {
+    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) {
        fprintf(stderr, "\n===============================================================================================\n");
-        fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
        fprintf(stderr, "===============================================================================================\n\n\n");
        return 1;
    }

View file

@ -16,6 +16,13 @@ Command line options:
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--numa STRATEGY`: Attempt one of the below optimization strategies that help on some NUMA systems
- `--numa distribute`: Spread execution evenly over all nodes
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
- `--numa numactl`: Use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437
-- `--numa`: Attempt optimizations that help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
@ -32,6 +39,12 @@ Command line options:
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend (default: 1 = disabled), used together with group attention width `--grp-attn-w`
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend (default: 512), used together with group attention factor `--grp-attn-n`
- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1)
- `--slots-endpoint-disable`: Disable the slots state monitoring endpoint. Slots state may contain user data, prompts included.
- `--metrics`: Enable a Prometheus-compatible `/metrics` endpoint (default: disabled)
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, default: enabled.
- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)
## Build
@ -88,6 +101,12 @@ curl --request POST \
  --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
```
## Advanced testing
We implemented a [server test framework](./tests/README.md) using human-readable scenarios.
*Before submitting an issue, please try to reproduce it with this format.*
## Node JS Test

You need to have [Node.js](https://nodejs.org/en) installed.
@ -125,9 +144,13 @@ node index.js
## API Endpoints

- **GET** `/health`: Returns the current state of the server:
-  - `{"status": "loading model"}` if the model is still being loaded.
-  - `{"status": "error"}` if the model failed to load.
-  - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
+  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
+  - 500 -> `{"status": "error"}` if the model failed to load.
+  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
+  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
+  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available.

  If the query parameter `include_slots` is passed, the `slots` field will contain internal slots data, except if `--slots-endpoint-disable` is set.
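For example, a client-side readiness probe against the `/health` status codes above might look like the following sketch (it assumes the third-party `requests` package and the default `localhost:8080` address):

```python
import time
import requests  # third-party: pip install requests

def wait_until_ready(base_url: str = "http://localhost:8080", timeout: float = 60.0) -> None:
    """Poll GET /health until the server reports status 'ok'."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            r = requests.get(f"{base_url}/health", timeout=5)
            # 503 ("loading model" / "no slot available") simply falls through and retries
            if r.status_code == 200 and r.json().get("status") == "ok":
                return
        except requests.RequestException:
            pass  # server not up yet
        time.sleep(1)
    raise TimeoutError("server did not become healthy in time")
```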
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
@ -137,7 +160,7 @@ node index.js
    `temperature`: Adjust the randomness of the generated text (default: 0.8).

-    `dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled).
+    `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled).

    `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).
@ -185,18 +208,22 @@ node index.js
    `ignore_eos`: Ignore end of stream token and continue generating (default: false).

-    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
+    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []).

    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)

    `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0)

    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

    `slot_id`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot (default: -1)

-    `cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
+    `cache_prompt`: Re-use the previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)

    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. (default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values)
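Putting several of these fields together, a completion request could look like the sketch below (the string-form `logit_bias` and the other values are illustrative, not recommendations; a server on `localhost:8080` is assumed):

```python
import requests

payload = {
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 64,
    "cache_prompt": True,                     # re-use the previously cached prompt if possible
    "logit_bias": [["Hello, World!", -0.5]],  # string form documented above
    "n_probs": 3,                             # include top-3 token probabilities per token
    "samplers": ["top_k", "top_p", "temperature"],
}
r = requests.post("http://localhost:8080/completion", json=payload, timeout=600)
r.raise_for_status()
print(r.json()["content"])
```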
### Result JSON
- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
@ -224,7 +251,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
-- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
+- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
- `model`: The path to the model loaded with `-m`
- `prompt`: The provided `prompt`
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
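A sketch of consuming the streaming variant follows. It assumes the server emits server-sent-event style `data: {...}` lines when `"stream": true` is set, as the bundled web client does; only `content` and `stop` are present in each chunk until the final one (see the note above):

```python
import json
import requests

with requests.post(
    "http://localhost:8080/completion",
    json={"prompt": "Once upon a time", "n_predict": 32, "stream": True},
    stream=True,
) as r:
    for line in r.iter_lines():
        if not line.startswith(b"data: "):
            continue
        chunk = json.loads(line[len(b"data: "):])
        print(chunk["content"], end="", flush=True)
        if chunk["stop"]:  # generation finished
            break
```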
@ -370,6 +397,81 @@ Notice that each `probs` is an array of length `n_probs`.
}'
```
- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
### Result JSON
```json
[
{
"dynatemp_exponent": 1.0,
"dynatemp_range": 0.0,
"frequency_penalty": 0.0,
"grammar": "",
"id": 0,
"ignore_eos": false,
"logit_bias": [],
"min_p": 0.05000000074505806,
"mirostat": 0,
"mirostat_eta": 0.10000000149011612,
"mirostat_tau": 5.0,
"model": "llama-2-7b-32k-instruct.Q2_K.gguf",
"n_ctx": 2048,
"n_keep": 0,
"n_predict": 100000,
"n_probs": 0,
"next_token": {
"has_next_token": true,
"n_remain": -1,
"num_tokens_predicted": 0,
"stopped_eos": false,
"stopped_limit": false,
"stopped_word": false,
"stopping_word": ""
},
"penalize_nl": true,
"penalty_prompt_tokens": [],
"presence_penalty": 0.0,
"prompt": "Say hello to llama.cpp",
"repeat_last_n": 64,
"repeat_penalty": 1.100000023841858,
"samplers": [
"top_k",
"tfs_z",
"typical_p",
"top_p",
"min_p",
"temperature"
],
"seed": 42,
"state": 1,
"stop": [
"\n"
],
"stream": false,
"task_id": 0,
"temperature": 0.0,
"tfs_z": 1.0,
"top_k": 40,
"top_p": 0.949999988079071,
"typical_p": 1.0,
"use_penalty_prompt_tokens": false
}
]
```
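A quick way to inspect this from a script (a sketch; it assumes the endpoint has not been disabled with `--slots-endpoint-disable` and uses only fields shown in the example above):

```python
import requests

slots = requests.get("http://localhost:8080/slots", timeout=5).json()
for slot in slots:
    busy = slot["next_token"]["has_next_token"]  # per-slot generation state
    print(f"slot {slot['id']}: model={slot['model']} has_next_token={busy}")
```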
- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled:
Available metrics:
- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
- `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
- `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
- `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage.
- `llamacpp:kv_cache_tokens`: KV-cache tokens.
  - `llamacpp:requests_processing`: Number of requests currently processing.
  - `llamacpp:requests_deferred`: Number of requests deferred.
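Since the endpoint speaks the plain-text Prometheus exposition format (one `name value` pair per line), it can also be checked without a Prometheus server; a small sketch:

```python
import requests

text = requests.get("http://localhost:8080/metrics", timeout=5).text
for line in text.splitlines():
    if line.startswith("llamacpp:"):  # the metric names listed above
        print(line)
```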
## More examples

### Change system prompt on runtime

View file

@ -15,7 +15,9 @@
using json = nlohmann::json;

inline static json oaicompat_completion_params_parse(
-    const json &body /* openai api json semantics */)
+    const struct llama_model * model,
+    const json &body, /* openai api json semantics */
+    const std::string &chat_template)
{
    json llama_params;
@ -30,7 +32,7 @@ inline static json oaicompat_completion_params_parse(
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"] = json_value(body, "model", std::string("unknown"));
-    llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
+    llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
    llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
    llama_params["temperature"] = json_value(body, "temperature", 0.0);
    llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
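On the client side nothing changes: the OpenAI-compatible endpoint keeps accepting the usual `messages` payload, and the server now renders it with the model's chat template (or one passed via `--chat-template`). A minimal sketch assuming a server on `localhost:8080`:

```python
import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "unknown",  # informational; the loaded model is used regardless
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Say hello to llama.cpp"},
        ],
        "temperature": 0.7,
    },
    timeout=600,
)
print(resp.json()["choices"][0]["message"]["content"])
```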

View file

@ -234,6 +234,7 @@
    mirostat_eta: 0.1, // learning rate
    grammar: '',
    n_probs: 0, // no completion_probabilities,
+   min_keep: 0, // min probs from each sampler,
    image_data: [],
    cache_prompt: true,
    api_key: ''
@ -791,6 +792,9 @@
    <fieldset>
        ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
    </fieldset>
+   <fieldset>
+       ${IntField({ label: "Min Probabilities from each Sampler", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
+   </fieldset>
    <fieldset>
        <label for="api_key">API Key</label>
        <input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />

File diff suppressed because it is too large

View file

@ -0,0 +1,47 @@
# Server tests
Python based server test scenarios using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) and [behave](https://behave.readthedocs.io/en/latest/):
* [issues.feature](./features/issues.feature) Pending issues scenario
* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
* [security.feature](./features/security.feature) Security, CORS and API Key
* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
Tests target GitHub workflow job runners with 4 vCPUs.

Requests are made with [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), an [asyncio](https://docs.python.org/fr/3/library/asyncio.html)-based HTTP client.

Note: If the host's inference speed is faster than that of the GitHub runners, the parallel scenarios may randomly fail. To mitigate this, you can increase the `n_predict` and `kv_size` values.
### Install dependencies
`pip install -r requirements.txt`
### Run tests
1. Build the server
```shell
cd ../../..
mkdir build
cd build
cmake ../
cmake --build . --target server
```
2. Download the required models:
1. `../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf`
3. Start the test: `./tests.sh`
It's possible to override some scenario step values with environment variables:
- `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080`
- `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server`
- `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose`
- `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format
### Run @bug, @wip or @wrong_usage annotated scenario
Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.
- `@bug` annotation aims to link a scenario with a GitHub issue.
- `@wrong_usage` are meant to show user issue that are actually an expected behavior
- `@wip` to focus on a scenario working in progress
To run a scenario annotated with `@bug`, start:
`DEBUG=ON ./tests.sh --no-skipped --tags bug`
After changing logic in `steps.py`, ensure that the `@bug` and `@wrong_usage` scenarios are updated.

View file

@ -0,0 +1,69 @@
import os
import socket
import subprocess
import time
from contextlib import closing
from signal import SIGKILL
def before_scenario(context, scenario):
print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
port = 8080
if 'PORT' in os.environ:
port = int(os.environ['PORT'])
if is_server_listening("localhost", port):
assert False, "Server already started"
def after_scenario(context, scenario):
if context.server_process is None:
return
if scenario.status == "failed":
if 'GITHUB_ACTIONS' in os.environ:
print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
if os.path.isfile('llama.log'):
with closing(open('llama.log', 'r')) as f:
for line in f:
print(line)
if not is_server_listening(context.server_fqdn, context.server_port):
print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")
if not pid_exists(context.server_process.pid):
assert False, f"Server not running pid={context.server_process.pid} ..."
print(f"stopping server pid={context.server_process.pid} ...")
context.server_process.kill()
# Wait few for socket to free up
time.sleep(0.05)
attempts = 0
while is_server_listening(context.server_fqdn, context.server_port):
print(f"stopping server pid={context.server_process.pid} ...")
os.kill(context.server_process.pid, SIGKILL)
time.sleep(0.1)
attempts += 1
if attempts > 5:
print(f"Server dangling exits, killing all {context.server_path} ...")
process = subprocess.run(['killall', '-9', context.server_path],
stderr=subprocess.PIPE,
universal_newlines=True)
print(process)
def is_server_listening(server_fqdn, server_port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
result = sock.connect_ex((server_fqdn, server_port))
return result == 0
def pid_exists(pid):
"""Check whether pid exists in the current process table."""
import errno
if pid < 0:
return False
try:
os.kill(pid, 0)
except OSError as e:
return e.errno == errno.EPERM
else:
return True

View file

@ -0,0 +1,4 @@
# List of ongoing issues
@bug
Feature: Issues
# No confirmed issue at the moment

View file

@ -0,0 +1,123 @@
@llama.cpp
Feature: Parallel
Background: Server startup
Given a server listening on localhost:8080
And a model file stories260K.gguf
And a model alias tinyllama-2
And 42 as server seed
And 64 KV cache size
And 2 slots
And embeddings extraction
And continuous batching
Then the server is starting
Then the server is healthy
Scenario Outline: Multi users completion
Given a prompt:
"""
Write a very long story about AI.
"""
And a prompt:
"""
Write another very long music lyrics.
"""
And <n_predict> max tokens to predict
Given concurrent completion requests
Then the server is busy
Then the server is idle
And all slots are idle
Then all prompts are predicted with <n_predict> tokens
Examples:
| n_predict |
| 128 |
Scenario Outline: Multi users OAI completions compatibility
Given a system prompt You are a writer.
And a model tinyllama-2
Given a prompt:
"""
Write a very long book.
"""
And a prompt:
"""
Write another a poem.
"""
And <n_predict> max tokens to predict
And streaming is <streaming>
Given concurrent OAI completions requests
Then the server is busy
Then the server is idle
Then all prompts are predicted with <n_predict> tokens
Examples:
| streaming | n_predict |
| disabled | 128 |
| enabled | 64 |
Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
Given a prompt:
"""
Write a very long story about AI.
"""
And a prompt:
"""
Write another very long music lyrics.
"""
And a prompt:
"""
Write a very long poem.
"""
And a prompt:
"""
Write a very long joke.
"""
And 128 max tokens to predict
Given concurrent completion requests
Then the server is busy
Then the server is idle
Then all prompts are predicted
Scenario: Multi users embeddings
Given a prompt:
"""
Write a very long story about AI.
"""
And a prompt:
"""
Write another very long music lyrics.
"""
And a prompt:
"""
Write a very long poem.
"""
And a prompt:
"""
Write a very long joke.
"""
Given concurrent embedding requests
Then the server is busy
Then the server is idle
Then all embeddings are generated
Scenario: Multi users OAI compatibility embeddings
Given a prompt:
"""
In which country Paris is located ?
"""
And a prompt:
"""
Is Madrid the capital of Spain ?
"""
And a prompt:
"""
What is the biggest US city ?
"""
And a prompt:
"""
What is the capital of Bulgaria ?
"""
And a model tinyllama-2
Given concurrent OAI embedding requests
Then the server is busy
Then the server is idle
Then all embeddings are generated


@@ -0,0 +1,50 @@
@llama.cpp
Feature: Security
Background: Server startup with an api key defined
Given a server listening on localhost:8080
And a model file stories260K.gguf
And a server api key llama.cpp
Then the server is starting
Then the server is healthy
Scenario Outline: Completion with some user api key
Given a prompt test
And a user api key <api_key>
And 4 max tokens to predict
And a completion request with <api_error> api error
Examples: Prompts
| api_key | api_error |
| llama.cpp | no |
| llama.cpp | no |
| hackeme | raised |
| | raised |
Scenario Outline: OAI Compatibility
Given a system prompt test
And a user prompt test
And a model test
And 2 max tokens to predict
And streaming is disabled
And a user api key <api_key>
Given an OAI compatible chat completions request with <api_error> api error
Examples: Prompts
| api_key | api_error |
| llama.cpp | no |
| llama.cpp | no |
| hackme | raised |
Scenario Outline: CORS Options
When an OPTIONS request is sent from <origin>
Then CORS header <cors_header> is set to <cors_header_value>
Examples: Headers
| origin | cors_header | cors_header_value |
| localhost | Access-Control-Allow-Origin | localhost |
| web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr |
| origin | Access-Control-Allow-Credentials | true |
| web.mydomain.fr | Access-Control-Allow-Methods | POST |
| web.mydomain.fr | Access-Control-Allow-Headers | * |


@@ -0,0 +1,84 @@
@llama.cpp
Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model file stories260K.gguf
And a model alias tinyllama-2
And 42 as server seed
# KV Cache corresponds to the total amount of tokens
# that can be stored across all independent sequences: #4130
# see --ctx-size and #5568
And 32 KV cache size
And 1 slots
And embeddings extraction
And 32 server max tokens to predict
And prometheus compatible metrics exposed
Then the server is starting
Then the server is healthy
Scenario: Health
Then the server is ready
And all slots are idle
Scenario Outline: Completion
Given a prompt <prompt>
And <n_predict> max tokens to predict
And a completion request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
And prometheus metrics are exposed
Examples: Prompts
| prompt | n_predict | re_content | n_predicted |
| I believe the meaning of life is | 8 | (read<or>going)+ | 8 |
| Write a joke about AI | 64 | (park<or>friends<or>scared<or>always)+ | 32 |
Scenario Outline: OAI Compatibility
Given a model <model>
And a system prompt <system_prompt>
And a user prompt <user_prompt>
And <max_tokens> max tokens to predict
And streaming is <enable_streaming>
Given an OAI compatible chat completions request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
Examples: Prompts
| model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
| llama-2 | Book | What is the best book | 8 | (Mom<or>what)+ | 8 | disabled |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks<or>happy<or>bird)+ | 32 | enabled |
Scenario: Embedding
When embeddings are computed for:
"""
What is the capital of Bulgaria ?
"""
Then embeddings are generated
Scenario: OAI Embeddings compatibility
Given a model tinyllama-2
When an OAI compatible embeddings computation request for:
"""
What is the capital of Spain ?
"""
Then embeddings are generated
Scenario: OAI Embeddings compatibility with multiple inputs
Given a model tinyllama-2
Given a prompt:
"""
In which country Paris is located ?
"""
And a prompt:
"""
Is Madrid the capital of Spain ?
"""
When an OAI compatible embeddings computation request for multiple inputs
Then embeddings are generated
Scenario: Tokenize / Detokenize
When tokenizing:
"""
What is the capital of France ?
"""
Then tokens can be detokenized
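
# A minimal sketch of the round trip this scenario exercises, assuming a
# server already listening on localhost:8080 (requests is used here purely
# for illustration; the step definitions below drive the same endpoints
# with aiohttp):
#
#   import requests
#   base = "http://localhost:8080"
#   text = "What is the capital of France ?"
#   tokens = requests.post(f"{base}/tokenize", json={"content": text}).json()["tokens"]
#   detok  = requests.post(f"{base}/detokenize", json={"tokens": tokens}).json()["content"]
#   assert detok.strip() == text  # SPM tokenizers add a leading space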


@@ -0,0 +1,801 @@
import asyncio
import collections
import json
import os
import re
import socket
import subprocess
import time
from contextlib import closing
from re import RegexFlag
import aiohttp
import openai
from behave import step
from behave.api.async_step import async_run_until_complete
from prometheus_client import parser
@step(u"a server listening on {server_fqdn}:{server_port}")
def step_server_config(context, server_fqdn, server_port):
context.server_fqdn = server_fqdn
context.server_port = int(server_port)
if 'PORT' in os.environ:
context.server_port = int(os.environ['PORT'])
print(f"$PORT set, overriding server port with to {context.server_port}")
context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
context.model_alias = None
context.n_ctx = None
context.n_predict = None
context.n_server_predict = None
context.n_slots = None
context.server_api_key = None
context.server_continuous_batching = False
context.server_embeddings = False
context.server_metrics = False
context.server_process = None
context.server_seed = None
context.user_api_key = None
context.tasks_result = []
context.concurrent_tasks = []
context.prompts = []
@step(u'a model file {model_file}')
def step_model_file(context, model_file):
context.model_file = model_file
@step(u'a model alias {model_alias}')
def step_model_alias(context, model_alias):
context.model_alias = model_alias
@step(u'{seed} as server seed')
def step_seed(context, seed):
context.server_seed = int(seed)
@step(u'{n_ctx} KV cache size')
def step_n_ctx(context, n_ctx):
context.n_ctx = int(n_ctx)
@step(u'{n_slots} slots')
def step_n_slots(context, n_slots):
context.n_slots = int(n_slots)
@step(u'{n_predict} server max tokens to predict')
def step_server_n_predict(context, n_predict):
context.n_server_predict = int(n_predict)
@step(u'continuous batching')
def step_server_continuous_batching(context):
context.server_continuous_batching = True
@step(u'embeddings extraction')
def step_server_embeddings(context):
context.server_embeddings = True
@step(u'prometheus compatible metrics exposed')
def step_server_metrics(context):
context.server_metrics = True
@step(u"the server is starting")
def step_start_server(context):
start_server_background(context)
attempts = 0
while True:
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
result = sock.connect_ex((context.server_fqdn, context.server_port))
if result == 0:
print("\x1b[33;46mserver started!\x1b[0m")
return
attempts += 1
if attempts > 20:
assert False, "server not started"
print(f"waiting for server to start, connect error code = {result}...")
time.sleep(0.1)
@step(u"the server is {expecting_status}")
@async_run_until_complete
async def step_wait_for_the_server_to_be_started(context, expecting_status):
match expecting_status:
case 'healthy':
await wait_for_health_status(context, context.base_url, 200, 'ok')
case 'ready' | 'idle':
await wait_for_health_status(context, context.base_url, 200, 'ok',
params={'fail_on_no_slot': 0, 'include_slots': 0},
slots_idle=context.n_slots,
slots_processing=0,
expected_slots=[{'id': slot_id, 'state': 0}
for slot_id in range(context.n_slots)])
case 'busy':
await wait_for_health_status(context, context.base_url, 503,
'no slot available',
params={'fail_on_no_slot': 0, 'include_slots': 0},
slots_idle=0,
slots_processing=context.n_slots,
expected_slots=[{'id': slot_id, 'state': 1}
for slot_id in range(context.n_slots)])
case _:
assert False, "unknown status"
@step(u'all slots are {expected_slot_status_string}')
@async_run_until_complete
async def step_all_slots_status(context, expected_slot_status_string):
match expected_slot_status_string:
case 'idle':
expected_slot_status = 0
case 'busy':
expected_slot_status = 1
case _:
assert False, "unknown status"
expected_slots = [{'id': slot_id, 'state': expected_slot_status}
for slot_id in range(context.n_slots)]
await request_slots_status(context, expected_slots)
@step(u'a completion request with {api_error} api error')
@async_run_until_complete
async def step_request_completion(context, api_error):
expect_api_error = api_error == 'raised'
completion = await request_completion(context.prompts.pop(),
context.base_url,
debug=context.debug,
n_predict=context.n_predict,
server_seed=context.server_seed,
expect_api_error=expect_api_error,
user_api_key=context.user_api_key)
context.tasks_result.append(completion)
if context.debug:
print(f"Completion response: {completion}")
if expect_api_error:
assert completion == 401, f"completion must be an 401 status code: {completion}"
@step(u'{predicted_n} tokens are predicted matching {re_content}')
def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n), re_content)
@step(u'{predicted_n} tokens are predicted')
def step_n_tokens_predicted(context, predicted_n):
assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n))
@step(u'a user prompt {user_prompt}')
def step_user_prompt(context, user_prompt):
context.prompts.append(user_prompt)
@step(u'a system prompt {system_prompt}')
def step_system_prompt(context, system_prompt):
context.system_prompt = system_prompt
@step(u'a model {model}')
def step_model(context, model):
context.model = model
@step(u'{max_tokens} max tokens to predict')
def step_max_tokens(context, max_tokens):
context.n_predict = int(max_tokens)
@step(u'streaming is {enable_streaming}')
def step_streaming(context, enable_streaming):
context.enable_streaming = enable_streaming == 'enabled'
@step(u'a user api key {user_api_key}')
def step_user_api_key(context, user_api_key):
context.user_api_key = user_api_key
@step(u'no user api key')
def step_no_user_api_key(context):
context.user_api_key = None
@step(u'a user api key ')
def step_no_user_api_key_space(context):
context.user_api_key = None
@step(u'a server api key {server_api_key}')
def step_server_api_key(context, server_api_key):
context.server_api_key = server_api_key
@step(u'an OAI compatible chat completions request with {api_error} api error')
@async_run_until_complete
async def step_oai_chat_completions(context, api_error):
if context.debug:
print(f"Submitting OAI compatible completions request...")
expect_api_error = api_error == 'raised'
completion = await oai_chat_completions(context.prompts.pop(),
context.system_prompt,
context.base_url,
False,
model=context.model if hasattr(context, 'model') else None,
n_predict=context.n_predict
if hasattr(context, 'n_predict') else None,
enable_streaming=context.enable_streaming
if hasattr(context, 'enable_streaming') else None,
server_seed=context.server_seed
if hasattr(context, 'server_seed') else None,
user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None,
expect_api_error=expect_api_error)
context.tasks_result.append(completion)
if context.debug:
print(f"Completion response: {completion}")
if expect_api_error:
assert completion == 401, f"completion must be an 401 status code: {completion}"
if context.debug:
print(f"Completion response: {completion}")
@step(u'a prompt')
def step_a_prompt(context):
context.prompts.append(context.text)
@step(u'a prompt {prompt}')
def step_a_prompt_prompt(context, prompt):
context.prompts.append(prompt)
@step(u'concurrent completion requests')
@async_run_until_complete()
async def step_concurrent_completion_requests(context):
await concurrent_requests(context,
request_completion,
# prompt is inserted automatically
context.base_url,
debug=context.debug,
n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
server_seed=context.server_seed if hasattr(context, 'server_seed') else None,
user_api_key=context.user_api_key if hasattr(context,
'user_api_key') else None)
@step(u'concurrent OAI completions requests')
@async_run_until_complete
async def step_concurrent_oai_chat_completions(context):
await concurrent_requests(context, oai_chat_completions,
# user_prompt is inserted automatically
context.system_prompt,
context.base_url,
True, # async_client
model=context.model
if hasattr(context, 'model') else None,
n_predict=context.n_predict
if hasattr(context, 'n_predict') else None,
enable_streaming=context.enable_streaming
if hasattr(context, 'enable_streaming') else None,
server_seed=context.server_seed
if hasattr(context, 'server_seed') else None,
user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None)
@step(u'all prompts are predicted')
@async_run_until_complete
async def step_all_prompts_are_predicted(context):
await all_prompts_are_predicted(context)
@step(u'all prompts are predicted with {n_predict} tokens')
@async_run_until_complete
async def step_all_prompts_are_predicted_with_n_tokens(context, n_predict):
expected_predicted_n = int(n_predict)
await all_prompts_are_predicted(context, expected_predicted_n)
async def all_prompts_are_predicted(context, expected_predicted_n=None):
n_completions = await gather_tasks_results(context)
assert n_completions > 0
for i in range(n_completions):
assert_n_tokens_predicted(context.tasks_result.pop(), expected_predicted_n=expected_predicted_n)
assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests"
@step(u'embeddings are computed for')
@async_run_until_complete
async def step_compute_embedding(context):
context.embeddings = await request_embedding(context.text, base_url=context.base_url)
@step(u'embeddings are generated')
def step_assert_embeddings(context):
if len(context.prompts) == 0:
assert_embeddings(context.embeddings)
else:
assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n"
f"context.prompts={context.prompts}\n"
f"context.embeddings={context.embeddings}")
for embedding in context.embeddings:
context.prompts.pop()
assert_embeddings(embedding)
@step(u'an OAI compatible embeddings computation request for')
@async_run_until_complete
async def step_oai_compute_embeddings(context):
context.embeddings = await request_oai_embeddings(context.text,
base_url=context.base_url,
user_api_key=context.user_api_key,
model=context.model)
@step(u'an OAI compatible embeddings computation request for multiple inputs')
@async_run_until_complete
async def step_oai_compute_embeddings_multiple_inputs(context):
context.embeddings = await request_oai_embeddings(context.prompts,
base_url=context.base_url,
user_api_key=context.user_api_key,
model=context.model)
@step(u'concurrent embedding requests')
@async_run_until_complete()
async def step_concurrent_embedding_requests(context):
await concurrent_requests(context,
request_embedding,
# prompt is inserted automatically
base_url=context.base_url)
@step(u'concurrent OAI embedding requests')
@async_run_until_complete()
async def step_concurrent_oai_embedding_requests(context):
await concurrent_requests(context,
request_oai_embeddings,
# prompt is inserted automatically
base_url=context.base_url,
async_client=True,
model=context.model)
@step(u'all embeddings are generated')
@async_run_until_complete()
async def all_embeddings_are_generated(context):
n_embedding_requests = await gather_tasks_results(context)
assert n_embedding_requests > 0
for i in range(n_embedding_requests):
assert_embeddings(context.tasks_result.pop())
@step(u'tokenizing')
@async_run_until_complete
async def step_tokenize(context):
context.tokenized_text = context.text
async with aiohttp.ClientSession() as session:
async with session.post(f'{context.base_url}/tokenize',
json={
"content": context.tokenized_text,
}) as response:
assert response.status == 200
tokenize_json = await response.json()
context.tokens = tokenize_json['tokens']
@step(u'tokens can be detokenized')
@async_run_until_complete
async def step_detokenize(context):
assert len(context.tokens) > 0
async with aiohttp.ClientSession() as session:
async with session.post(f'{context.base_url}/detokenize',
json={
"tokens": context.tokens,
}) as response:
assert response.status == 200
detokenize_json = await response.json()
# SPM tokenizer adds a whitespace prefix: https://github.com/google/sentencepiece/issues/15
assert context.tokenized_text == detokenize_json['content'].strip()
@step(u'an OPTIONS request is sent from {origin}')
@async_run_until_complete
async def step_options_request(context, origin):
async with aiohttp.ClientSession() as session:
async with session.options(f'{context.base_url}/v1/chat/completions',
headers={"Origin": origin}) as response:
assert response.status == 200
context.options_response = response
@step(u'CORS header {cors_header} is set to {cors_header_value}')
def step_check_options_header_value(context, cors_header, cors_header_value):
assert context.options_response.headers[cors_header] == cors_header_value
@step(u'prometheus metrics are exposed')
@async_run_until_complete
async def step_prometheus_metrics_exported(context):
async with aiohttp.ClientSession() as session:
async with await session.get(f'{context.base_url}/metrics') as metrics_response:
assert metrics_response.status == 200
assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
metrics_raw = await metrics_response.text()
metric_exported = False
for metric in parser.text_string_to_metric_families(metrics_raw):
match metric.name:
case "llamacpp:kv_cache_usage_ratio":
assert len(metric.samples) > 0
metric_exported = True
assert metric_exported, "No metrics exported"
async def concurrent_requests(context, f_completion, *args, **kwargs):
n_prompts = len(context.prompts)
if context.debug:
print(f"starting {n_prompts} concurrent completion requests...")
assert n_prompts > 0
for prompt_no in range(n_prompts):
shifted_args = [context.prompts.pop(), *args]
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
await asyncio.sleep(0.1)
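# Each task pops one prompt from context.prompts and runs as a separate
# asyncio task; results are drained later by gather_tasks_results(), which
# lets the busy/idle health checks observe the server while requests are in
# flight.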
async def request_completion(prompt,
base_url,
debug=False,
n_predict=None,
server_seed=None,
expect_api_error=None,
user_api_key=None):
if debug:
print(f"Sending completion request: {prompt}")
origin = "my.super.domain"
headers = {
'Origin': origin
}
if user_api_key is not None:
if debug:
print(f"Set user_api_key: {user_api_key}")
headers['Authorization'] = f'Bearer {user_api_key}'
async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/completion',
json={
"prompt": prompt,
"n_predict": int(n_predict) if n_predict is not None else -1,
"seed": server_seed if server_seed is not None else 42
},
headers=headers) as response:
if expect_api_error is None or not expect_api_error:
assert response.status == 200
assert response.headers['Access-Control-Allow-Origin'] == origin
return await response.json()
else:
return response.status
async def oai_chat_completions(user_prompt,
system_prompt,
base_url,
async_client,
debug=False,
model=None,
n_predict=None,
enable_streaming=None,
server_seed=None,
user_api_key=None,
expect_api_error=None):
if debug:
print(f"Sending OAI Chat completions request: {user_prompt}")
# openai client always expects an api key
user_api_key = user_api_key if user_api_key is not None else 'nope'
seed = server_seed if server_seed is not None else 42
enable_streaming = enable_streaming if enable_streaming is not None else False
payload = {
"messages": [
{
"role": "system",
"content": system_prompt,
},
{
"role": "user",
"content": user_prompt,
}
],
"model": model,
"max_tokens": n_predict,
"stream": enable_streaming,
"seed": seed
}
completion_response = {
'content': '',
'timings': {
'predicted_n': 0
}
}
if async_client:
origin = 'llama.cpp'
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/v1/chat/completions',
json=payload,
headers=headers) as response:
if enable_streaming:
assert response.status == 200
assert response.headers['Access-Control-Allow-Origin'] == origin
assert response.headers['Content-Type'] == "text/event-stream"
event_received = True
while event_received:
event_received = False
async for line_in_bytes in response.content:
line = line_in_bytes.decode('utf8')
line = line.rstrip('\n').rstrip('\r')
if line == '':
continue
event_data = line.split(': ', 1)
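# server-sent events arrive as "data: {json}" frames; split only on the
# first ': ' so colons inside the JSON payload survive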
assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
chunk_raw = event_data[1]
chunk = json.loads(chunk_raw)
assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
delta = chunk['choices'][0]['delta']
if 'content' in delta:
completion_response['content'] += delta['content']
completion_response['timings']['predicted_n'] += 1
else:
if expect_api_error is None or not expect_api_error:
assert response.status == 200
assert response.headers['Access-Control-Allow-Origin'] == origin
assert response.headers['Content-Type'] == "application/json; charset=utf-8"
chat_completion_raw = await response.json()
completion_response = {
'content': chat_completion_raw['choices'][0]['message'],
'timings': {
'predicted_n': chat_completion_raw['usage']['completion_tokens']
}
}
else:
return response.status
else:
try:
openai.api_key = user_api_key
openai.api_base = f'{base_url}/v1/chat'
chat_completion = openai.Completion.create(
messages=payload['messages'],
model=model,
max_tokens=n_predict,
stream=enable_streaming,
seed=seed
)
except openai.error.APIError as e:
if expect_api_error is not None and expect_api_error:
return 401
else:
assert False, f'error raised: {e}'
if enable_streaming:
for chunk in chat_completion:
assert len(chunk.choices) == 1
delta = chunk.choices[0].delta
if 'content' in delta:
completion_response['content'] += delta['content']
completion_response['timings']['predicted_n'] += 1
else:
assert len(chat_completion.choices) == 1
completion_response = {
'content': chat_completion.choices[0].message.content,
'timings': {
'predicted_n': chat_completion.usage.completion_tokens
}
}
if debug:
print("OAI response formatted to llama.cpp:", completion_response)
return completion_response
async def request_embedding(content, base_url=None):
async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/embedding',
json={
"content": content,
}) as response:
assert response.status == 200
response_json = await response.json()
return response_json['embedding']
async def request_oai_embeddings(input,
base_url=None, user_api_key=None,
model=None, async_client=False):
# openai client always expects an api_key
user_api_key = user_api_key if user_api_key is not None else 'nope'
if async_client:
origin = 'llama.cpp'
if user_api_key is not None:
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/v1/embeddings',
json={
"input": input,
"model": model,
},
headers=headers) as response:
assert response.status == 200, f"received status code not expected: {response.status}"
assert response.headers['Access-Control-Allow-Origin'] == origin
assert response.headers['Content-Type'] == "application/json; charset=utf-8"
response_json = await response.json()
assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
assert response_json['object'] == 'list'
return response_json['data']
else:
openai.api_key = user_api_key
openai.api_base = f'{base_url}/v1'
oai_embeddings = openai.Embedding.create(
model=model,
input=input,
)
if isinstance(input, collections.abc.Sequence):
embeddings = []
for an_oai_embeddings in oai_embeddings.data:
embeddings.append(an_oai_embeddings.embedding)
else:
embeddings = oai_embeddings.data.embedding
return embeddings
def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
content = completion_response['content']
n_predicted = completion_response['timings']['predicted_n']
assert len(content) > 0, "no token predicted"
if expected_predicted_n is not None:
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
f' {n_predicted} <> {expected_predicted_n}')
if re_content is not None:
re_content = '^.*' + re_content.replace('<or>', '|') + '.*$'
assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), (
f'invalid tokens predicted:'
f' ```\n{content}\n``` do not match /{re_content}/')
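# '<or>' stands in for '|' in the feature tables (where '|' delimits cells)
# and is turned back into a regex alternation here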
async def gather_tasks_results(context):
n_tasks = len(context.concurrent_tasks)
if context.debug:
print(f"Waiting for all {n_tasks} tasks results...")
for task_no in range(n_tasks):
context.tasks_result.append(await context.concurrent_tasks.pop())
n_completions = len(context.tasks_result)
return n_completions
async def wait_for_health_status(context,
base_url,
expected_http_status_code,
expected_health_status,
params=None,
slots_idle=None,
slots_processing=None,
expected_slots=None):
if context.debug:
print(f"Starting checking for health for expected_health_status={expected_health_status}")
timeout = 3 # seconds
interval = 0.5
counter = 0
async with aiohttp.ClientSession() as session:
while True:
async with await session.get(f'{base_url}/health', params=params) as health_response:
status_code = health_response.status
health = await health_response.json()
if context.debug:
print(f"HEALTH - response for expected health status='{expected_health_status}' on "
f"'{base_url}/health'?{params} is {health}")
if (status_code == expected_http_status_code
and health['status'] == expected_health_status
and (slots_idle is None or health['slots_idle'] == slots_idle)
and (slots_processing is None or health['slots_processing'] == slots_processing)):
if expected_slots is not None:
assert_slots_status(health['slots'], expected_slots)
return
await asyncio.sleep(interval)
counter += interval
if counter >= timeout:
# Sometimes health requests are triggered after completions are predicted
if expected_http_status_code == 503:
if len(context.tasks_result) == 0:
print("\x1b[5;37;43mWARNING: forcing concurrent tasks,"
" busy health check missed, probably too fast inference\x1b[0m")
n_completions = await gather_tasks_results(context)
if n_completions > 0:
return
assert False, 'timeout exceeded'
def assert_embeddings(embeddings):
assert len(embeddings) > 0
embeddings_computed = False
for emb in embeddings:
if emb != 0:
embeddings_computed = True
assert embeddings_computed, f"Embeddings: {embeddings}"
async def request_slots_status(context, expected_slots):
async with aiohttp.ClientSession() as session:
async with await session.get(f'{context.base_url}/slots') as slots_response:
assert slots_response.status == 200
slots = await slots_response.json()
assert_slots_status(slots, expected_slots)
def assert_slots_status(slots, expected_slots):
assert len(slots) == len(expected_slots)
for slot_id, (expected, slot) in enumerate(zip(expected_slots, slots)):
for key in expected:
assert expected[key] == slot[key], (f"invalid slot {slot_id}"
f" expected[{key}] != slot[{key}]"
f" = {expected[key]} != {slot[key]}")
def start_server_background(context):
context.server_path = '../../../build/bin/server'
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
server_args = [
'--host', context.server_fqdn,
'--port', context.server_port,
'--model', context.model_file
]
if context.server_continuous_batching:
server_args.append('--cont-batching')
if context.server_embeddings:
server_args.append('--embedding')
if context.server_metrics:
server_args.append('--metrics')
if context.model_alias is not None:
server_args.extend(['--alias', context.model_alias])
if context.n_ctx is not None:
server_args.extend(['--ctx-size', context.n_ctx])
if context.n_slots is not None:
server_args.extend(['--parallel', context.n_slots])
if context.n_server_predict is not None:
server_args.extend(['--n-predict', context.n_server_predict])
if context.server_api_key is not None:
server_args.extend(['--api-key', context.server_api_key])
if context.debug:
server_args.append('--verbose')
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
server_args.extend(['--log-format', "text"])
print(f"starting server with: {context.server_path}", *server_args)
context.server_process = subprocess.Popen(
[str(arg) for arg in [context.server_path, *server_args]],
close_fds=True)
print(f"server pid={context.server_process.pid}")


@@ -0,0 +1,21 @@
# run with ./test.sh --tags wrong_usage
@wrong_usage
Feature: Wrong usage of llama.cpp server
#3969 The user must always set --n-predict option
# to cap the number of tokens any completion request can generate
# or pass n_predict/max_tokens in the request.
Scenario: Infinite loop
Given a server listening on localhost:8080
And a model file stories260K.gguf
# Uncomment below to fix the issue
#And 64 server max tokens to predict
Then the server is starting
Given a prompt:
"""
Go to: infinite loop
"""
# Uncomment below to fix the issue
#And 128 max tokens to predict
Given concurrent completion requests
Then all prompts are predicted
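
# A client-side sketch of the same cap, assuming the server above on
# localhost:8080 (illustrative only):
#
#   import requests
#   requests.post("http://localhost:8080/completion",
#                 json={"prompt": "Go to: infinite loop", "n_predict": 128})
#
# Either the server-side --n-predict or the per-request n_predict is enough
# to bound the generation.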


@@ -0,0 +1,4 @@
aiohttp~=3.9.3
behave~=1.2.6
openai~=0.25.0
prometheus-client~=0.20.0

12
examples/server/tests/tests.sh Executable file

@@ -0,0 +1,12 @@
#!/bin/bash
set -eu
if [ $# -lt 1 ]
then
# Start @llama.cpp scenario
behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp
else
behave "$@"
fi


@@ -14,6 +14,7 @@
 using json = nlohmann::json;

 extern bool server_verbose;
+extern bool server_log_json;

 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
@@ -27,14 +28,14 @@ extern bool server_verbose;
     { \
         if (server_verbose) \
         { \
-            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
+            server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
         } \
     } while (0)
 #endif

-#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)

 //
 // parallel
@@ -49,7 +50,8 @@ enum server_state {
 enum task_type {
     TASK_TYPE_COMPLETION,
     TASK_TYPE_CANCEL,
-    TASK_TYPE_NEXT_RESPONSE
+    TASK_TYPE_NEXT_RESPONSE,
+    TASK_TYPE_METRICS
 };

 struct task_server {
@@ -132,26 +134,48 @@ struct completion_token_output
     std::string text_to_send;
 };

-static inline void server_log(const char *level, const char *function, int line,
-                              const char *message, const nlohmann::ordered_json &extra)
+static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra)
 {
-    nlohmann::ordered_json log
-    {
+    std::stringstream ss_tid;
+    ss_tid << std::this_thread::get_id();
+    json log = nlohmann::ordered_json{
+        {"tid", ss_tid.str()},
         {"timestamp", time(nullptr)},
-        {"level", level},
-        {"function", function},
-        {"line", line},
-        {"message", message},
     };

-    if (!extra.empty())
-    {
-        log.merge_patch(extra);
-    }
+    if (server_log_json) {
+        log.merge_patch(
+            {
+                {"level", level},
+                {"function", function},
+                {"line", line},
+                {"msg", message},
+            });
+        if (!extra.empty()) {
+            log.merge_patch(extra);
+        }

-    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
-    printf("%.*s\n", (int)str.size(), str.data());
-    fflush(stdout);
+        std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
+    } else {
+        char buf[1024];
+        snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);

+        if (!extra.empty()) {
+            log.merge_patch(extra);
+        }
+        std::stringstream ss;
+        ss << buf << " |";
+        for (const auto& el : log.items())
+        {
+            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
+            snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str());
+            ss << buf;
+        }
+        const std::string str = ss.str();
+        printf("%.*s\n", (int)str.size(), str.data());
+        fflush(stdout);
+    }
 }

 //
@@ -167,20 +191,47 @@ static T json_value(const json &body, const std::string &key, const T &default_v
         : default_value;
 }

-inline std::string format_chatml(std::vector<json> messages)
-{
-    std::ostringstream chatml_msgs;
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+inline bool verify_custom_template(const std::string & tmpl) {
+    llama_chat_message chat[] = {{"user", "test"}};
+    std::vector<char> buf(1);
+    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
+    return res >= 0;
+}

-    for (auto it = messages.begin(); it != messages.end(); ++it) {
-        chatml_msgs << "<|im_start|>"
-                    << json_value(*it, "role", std::string("user")) << '\n';
-        chatml_msgs << json_value(*it, "content", std::string(""))
-                    << "<|im_end|>\n";
+// Format given chat. If tmpl is empty, we take the template from model metadata
+inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages)
+{
+    size_t alloc_size = 0;
+    // vector holding all allocated string to be passed to llama_chat_apply_template
+    std::vector<std::string> str(messages.size() * 2);
+    std::vector<llama_chat_message> chat(messages.size());

+    for (size_t i = 0; i < messages.size(); ++i) {
+        auto &curr_msg = messages[i];
+        str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
+        str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
+        alloc_size += str[i*2 + 1].length();
+        chat[i].role = str[i*2 + 0].c_str();
+        chat[i].content = str[i*2 + 1].c_str();
     }

-    chatml_msgs << "<|im_start|>assistant" << '\n';
+    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    std::vector<char> buf(alloc_size * 2);

-    return chatml_msgs.str();
+    // run the first time to get the total output length
+    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
+
+    // if it turns out that our buffer is too small, we resize it
+    if ((size_t) res > buf.size()) {
+        buf.resize(res);
+        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
+    }
+
+    std::string formatted_chat(buf.data(), res);
+    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
+
+    return formatted_chat;
 }

 //
@@ -190,6 +241,7 @@ inline std::string format_chatml(std::vector<json> messages)
 struct llama_server_queue {
     int id = 0;
     std::mutex mutex_tasks;
+    bool running;
     // queues
     std::vector<task_server> queue_tasks;
     std::vector<task_server> queue_tasks_deferred;
@@ -205,6 +257,7 @@ struct llama_server_queue {
         std::unique_lock<std::mutex> lock(mutex_tasks);
         if (task.id == -1) {
             task.id = id++;
+            LOG_VERBOSE("new task id", {{"new_id", task.id}});
         }
         queue_tasks.push_back(std::move(task));
         condition_tasks.notify_one();
@@ -220,7 +273,9 @@ struct llama_server_queue {
     // Get the next id for creating a new task
     int get_new_id() {
         std::unique_lock<std::mutex> lock(mutex_tasks);
-        return id++;
+        int new_id = id++;
+        LOG_VERBOSE("new task id", {{"new_id", new_id}});
+        return new_id;
     }

     // Register function to process a new task
@@ -248,12 +303,20 @@ struct llama_server_queue {
         queue_tasks_deferred.clear();
     }

-    // Start the main loop. This call is blocking
-    [[noreturn]]
+    // end the start_loop routine
+    void terminate() {
+        {
+            std::unique_lock<std::mutex> lock(mutex_tasks);
+            running = false;
+        }
+        condition_tasks.notify_all();
+    }
+
+    // Start the main loop.
     void start_loop() {
+        running = true;
         while (true) {
-            // new task arrived
-            LOG_VERBOSE("have new task", {});
+            LOG_VERBOSE("new task may arrive", {});
             {
                 while (true)
                 {
@@ -265,7 +328,7 @@ struct llama_server_queue {
                     task_server task = queue_tasks.front();
                     queue_tasks.erase(queue_tasks.begin());
                     lock.unlock();
-                    LOG_VERBOSE("callback_new_task", {});
+                    LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
                     callback_new_task(task);
                 }
                 LOG_VERBOSE("callback_all_task_finished", {});
@@ -294,8 +357,12 @@ struct llama_server_queue {
             {
                 std::unique_lock<std::mutex> lock(mutex_tasks);
                 if (queue_tasks.empty()) {
+                    if (!running) {
+                        LOG_VERBOSE("ending start_loop", {});
+                        return;
+                    }
                     condition_tasks.wait(lock, [&]{
-                        return !queue_tasks.empty();
+                        return (!queue_tasks.empty() || !running);
                     });
                 }
             }
@@ -342,11 +409,13 @@ struct llama_server_response {
     std::condition_variable condition_results;

     void add_waiting_task_id(int task_id) {
+        LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
         std::unique_lock<std::mutex> lock(mutex_results);
         waiting_task_ids.insert(task_id);
     }

     void remove_waiting_task_id(int task_id) {
+        LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
         std::unique_lock<std::mutex> lock(mutex_results);
         waiting_task_ids.erase(task_id);
     }
@@ -359,7 +428,6 @@ struct llama_server_response {
             condition_results.wait(lock, [&]{
                 return !queue_results.empty();
             });
-            LOG_VERBOSE("condition_results unblock", {});

             for (int i = 0; i < (int) queue_results.size(); i++)
             {
@@ -384,22 +452,22 @@ struct llama_server_response {
     // Send a new result to a waiting task_id
     void send(task_result result) {
         std::unique_lock<std::mutex> lock(mutex_results);
-        LOG_VERBOSE("send new result", {});
+        LOG_VERBOSE("send new result", {{"task_id", result.id}});
        for (auto& task_id : waiting_task_ids) {
            // LOG_TEE("waiting task id %i \n", task_id);
            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
            if (result.multitask_id == task_id)
            {
-                LOG_VERBOSE("callback_update_multitask", {});
+                LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
                callback_update_multitask(task_id, result.id, result);
                continue;
            }

            if (result.id == task_id)
            {
-                LOG_VERBOSE("queue_results.push_back", {});
+                LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
                queue_results.push_back(result);
-                condition_results.notify_one();
+                condition_results.notify_all();
                return;
            }
        }
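
For reference, the two output shapes the reworked server_log produces, with illustrative values: with server_log_json enabled, one JSON object per line, e.g. {"tid":"140519","timestamp":1708870000,"level":"INFO","function":"main","line":42,"msg":"server started"}; otherwise the text form built by the snprintf/stringstream path, e.g. INFO [                    main] server started | tid="140519" timestamp=1708870000.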


@@ -31,7 +31,8 @@ int main(int argc, char ** argv) {

     // init LLM

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

     // initialize the model


@@ -50,7 +50,8 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS

     // init llama.cpp
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

     llama_model * model_tgt = NULL;
     llama_model * model_dft = NULL;


@@ -17,7 +17,7 @@ int main(int argc, char ** argv) {

     const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";

-    llama_backend_init(false);
+    llama_backend_init();

     llama_model_params model_params = llama_model_default_params();
     model_params.vocab_only = true;


@ -1,5 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h"
#include "common.h" #include "common.h"
#include "train.h" #include "train.h"
#include "llama.h" #include "llama.h"
@ -19,8 +20,6 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
static const size_t tensor_alignment = 32;
struct my_llama_hparams { struct my_llama_hparams {
uint32_t n_vocab = 32000; uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; uint32_t n_ctx = 512;
@ -51,14 +50,14 @@ struct my_llama_layer {
struct ggml_tensor * ffn_norm; struct ggml_tensor * ffn_norm;
// ff // ff
struct ggml_tensor * w1; struct ggml_tensor * ffn_gate; // w1
struct ggml_tensor * w2; struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * w3; struct ggml_tensor * ffn_up; // w3
}; };
struct my_llama_model { struct my_llama_model {
struct ggml_context * ctx = NULL; struct ggml_context * ctx = NULL;
std::vector<uint8_t> data; ggml_backend_buffer_t data = NULL;
my_llama_hparams hparams; my_llama_hparams hparams;
@ -112,13 +111,13 @@ static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down";
static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
static void print_params(struct my_llama_hparams * params) { static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
printf("%s: n_embd: %d\n", __func__, params->n_embd); printf("%s: n_embd: %u\n", __func__, params->n_embd);
printf("%s: n_head: %d\n", __func__, params->n_head); printf("%s: n_head: %u\n", __func__, params->n_head);
printf("%s: n_ff: %d\n", __func__, params->n_ff); printf("%s: n_ff: %u\n", __func__, params->n_ff);
printf("%s: n_layer: %d\n", __func__, params->n_layer); printf("%s: n_layer: %u\n", __func__, params->n_layer);
printf("%s: n_rot: %d\n", __func__, params->n_rot); printf("%s: n_rot: %u\n", __func__, params->n_rot);
} }
static void set_param_model(struct my_llama_model * model) { static void set_param_model(struct my_llama_model * model) {
@ -141,42 +140,9 @@ static void set_param_model(struct my_llama_model * model) {
ggml_set_param(ctx, layer.wv); ggml_set_param(ctx, layer.wv);
ggml_set_param(ctx, layer.wo); ggml_set_param(ctx, layer.wo);
ggml_set_param(ctx, layer.ffn_norm); ggml_set_param(ctx, layer.ffn_norm);
ggml_set_param(ctx, layer.w1); ggml_set_param(ctx, layer.ffn_gate);
ggml_set_param(ctx, layer.w2); ggml_set_param(ctx, layer.ffn_down);
ggml_set_param(ctx, layer.w3); ggml_set_param(ctx, layer.ffn_up);
}
}
static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * model) {
ggml_allocr_alloc(alloc, model->tok_embeddings);
ggml_allocr_alloc(alloc, model->norm);
ggml_allocr_alloc(alloc, model->output);
for (uint32_t i = 0; i < model->layers.size(); ++i) {
auto & layer = model->layers[i];
ggml_allocr_alloc(alloc, layer.attention_norm);
ggml_allocr_alloc(alloc, layer.wq);
ggml_allocr_alloc(alloc, layer.wk);
ggml_allocr_alloc(alloc, layer.wv);
ggml_allocr_alloc(alloc, layer.wo);
ggml_allocr_alloc(alloc, layer.ffn_norm);
ggml_allocr_alloc(alloc, layer.w1);
ggml_allocr_alloc(alloc, layer.w2);
ggml_allocr_alloc(alloc, layer.w3);
}
ggml_allocr_alloc(alloc, model->tok_embeddings->grad);
ggml_allocr_alloc(alloc, model->norm->grad);
ggml_allocr_alloc(alloc, model->output->grad);
for (uint32_t i = 0; i < model->layers.size(); ++i) {
auto & layer = model->layers[i];
ggml_allocr_alloc(alloc, layer.attention_norm->grad);
ggml_allocr_alloc(alloc, layer.wq->grad);
ggml_allocr_alloc(alloc, layer.wk->grad);
ggml_allocr_alloc(alloc, layer.wv->grad);
ggml_allocr_alloc(alloc, layer.wo->grad);
ggml_allocr_alloc(alloc, layer.ffn_norm->grad);
ggml_allocr_alloc(alloc, layer.w1->grad);
ggml_allocr_alloc(alloc, layer.w2->grad);
ggml_allocr_alloc(alloc, layer.w3->grad);
} }
} }
@ -232,9 +198,9 @@ static void init_model(struct my_llama_model * model) {
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); layer.ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i)); ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i));
@ -245,24 +211,15 @@ static void init_model(struct my_llama_model * model) {
ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i)); ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i));
ggml_set_name(layer.w1, tni(LLM_TENSOR_FFN_GATE, i)); ggml_set_name(layer.ffn_gate, tni(LLM_TENSOR_FFN_GATE, i));
ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i)); ggml_set_name(layer.ffn_down, tni(LLM_TENSOR_FFN_DOWN, i));
ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i)); ggml_set_name(layer.ffn_up, tni(LLM_TENSOR_FFN_UP, i));
} }
set_param_model(model); set_param_model(model);
// measure data size
size_t size = 0;
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
}
// allocate data // allocate data
struct ggml_allocr * alloc = NULL; model->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
model->data.resize(size + tensor_alignment);
alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
alloc_model(alloc, model);
} }
static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
@ -287,9 +244,9 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean,
randomize_tensor_normal(layer.ffn_norm, rnd); randomize_tensor_normal(layer.ffn_norm, rnd);
randomize_tensor_normal(layer.w1, rnd); randomize_tensor_normal(layer.ffn_gate, rnd);
randomize_tensor_normal(layer.w2, rnd); randomize_tensor_normal(layer.ffn_down, rnd);
randomize_tensor_normal(layer.w3, rnd); randomize_tensor_normal(layer.ffn_up, rnd);
} }
free_random_normal_distribution(rnd); free_random_normal_distribution(rnd);
@ -297,7 +254,7 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean,
static struct ggml_tensor * llama_build_train_graphs( static struct ggml_tensor * llama_build_train_graphs(
struct my_llama_model * model, struct my_llama_model * model,
struct ggml_allocr * alloc, ggml_gallocr_t alloc,
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_cgraph * gb, struct ggml_cgraph * gb,
@ -308,7 +265,8 @@ static struct ggml_tensor * llama_build_train_graphs(
const int n_tokens, const int n_tokens,
const int n_batch, const int n_batch,
const bool enable_flash_attn, const bool enable_flash_attn,
const bool enable_checkpointing) { const bool enable_checkpointing,
const bool measure_only) {
ggml_set_scratch(ctx, { 0, 0, nullptr, }); ggml_set_scratch(ctx, { 0, 0, nullptr, });
const int n_past = 0; const int n_past = 0;
@ -334,13 +292,7 @@ static struct ggml_tensor * llama_build_train_graphs(
// KQ_pos - contains the positions // KQ_pos - contains the positions
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
ggml_allocr_alloc(alloc, KQ_pos); ggml_set_input(KQ_pos);
if (!ggml_allocr_is_measure(alloc)) {
int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
}
// rope has so much parameters that we make a custom function for it // rope has so much parameters that we make a custom function for it
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
@ -404,11 +356,11 @@ static struct ggml_tensor * llama_build_train_graphs(
struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
cur = t30; cur = t30;
checkpoints.push_back(cur); checkpoints.push_back(cur);
@ -448,21 +400,31 @@ static struct ggml_tensor * llama_build_train_graphs(
// KQ_pos // KQ_pos
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
ggml_set_input(t36->grad);
ggml_allocr_alloc(alloc, t36->grad);
// allocating checkpoints in one block to reduce memory fragmentation // allocating checkpoints in one block to reduce memory fragmentation
// note: they will be freed in reverse order // note: they will be freed in reverse order
for (int i = 0; i < (int) checkpoints.size(); ++i) { for (int i = 0; i < (int) checkpoints.size(); ++i) {
if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
ggml_allocr_alloc(alloc, checkpoints[i]); ggml_set_input(checkpoints[i]);
} }
} }
//int n_leafs_after = gb->n_leafs; //int n_leafs_after = gb->n_leafs;
//int n_nodes_after = gb->n_nodes; //int n_nodes_after = gb->n_nodes;
if (measure_only) {
// FIXME: will still allocate
ggml_gallocr_reserve(alloc, gb);
} else {
ggml_gallocr_alloc_graph(alloc, gb);
ggml_allocr_alloc_graph(alloc, gb); if (!measure_only) {
int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
}
}
// remove the additional nodes and leafs // remove the additional nodes and leafs
for (int i = n_leafs_before; i < gb->n_leafs; ++i) { for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
@ -559,9 +521,9 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
copy_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); copy_tensor_by_name(layer.ffn_gate, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
copy_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); copy_tensor_by_name(layer.ffn_down, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
copy_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); copy_tensor_by_name(layer.ffn_up, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
} }
} }
@ -702,9 +664,9 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
gguf_add_tensor(fctx, layer.wv); gguf_add_tensor(fctx, layer.wv);
gguf_add_tensor(fctx, layer.wo); gguf_add_tensor(fctx, layer.wo);
gguf_add_tensor(fctx, layer.ffn_norm); gguf_add_tensor(fctx, layer.ffn_norm);
gguf_add_tensor(fctx, layer.w1); gguf_add_tensor(fctx, layer.ffn_gate);
gguf_add_tensor(fctx, layer.w2); gguf_add_tensor(fctx, layer.ffn_down);
gguf_add_tensor(fctx, layer.w3); gguf_add_tensor(fctx, layer.ffn_up);
} }
} }
@ -953,9 +915,9 @@ static int64_t get_parameter_count(struct my_llama_model* model) {
nx += ggml_nelements(layer.wv); nx += ggml_nelements(layer.wv);
nx += ggml_nelements(layer.wo); nx += ggml_nelements(layer.wo);
nx += ggml_nelements(layer.ffn_norm); nx += ggml_nelements(layer.ffn_norm);
nx += ggml_nelements(layer.w1); nx += ggml_nelements(layer.ffn_gate);
nx += ggml_nelements(layer.w2); nx += ggml_nelements(layer.ffn_down);
nx += ggml_nelements(layer.w3); nx += ggml_nelements(layer.ffn_up);
} }
return nx; return nx;
} }
@ -998,7 +960,7 @@ int main(int argc, char ** argv) {
struct ggml_opt_context * opt = train->opt; struct ggml_opt_context * opt = train->opt;
// set opt params from command line // set opt params from command line
opt->params = ggml_opt_default_params(GGML_OPT_ADAM); opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
opt->params.print_forward_graph = false; opt->params.print_forward_graph = false;
opt->params.print_backward_graph = false; opt->params.print_backward_graph = false;
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
@@ -1046,7 +1008,7 @@ int main(int argc, char ** argv) {
     printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
     printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
     printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f));
+    printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)), (float) (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)) / (1024.0f*1024.0f));
 
     if (params.only_write_model) {
         save_train_files_data save_data;
@@ -1073,11 +1035,6 @@ int main(int argc, char ** argv) {
     int n_vocab = model.hparams.n_vocab;
     int n_batch = params.common.n_batch;
 
-    std::vector<uint8_t> mem_input_data;
-    std::vector<uint8_t> mem_compute_data;
-
-    ggml_allocr * alloc = NULL;
-
     // context for input tensors without their data
     struct ggml_init_params ctx_input_params = {
         ggml_tensor_overhead() * 2, // mem_size
@@ -1091,16 +1048,10 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
 
     // measure required memory for input tensors
-    size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
-                            GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
-                            tensor_alignment;
-    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
-
     // allocate input tensors
-    mem_input_data.resize(max_input_size);
-    alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
-    ggml_allocr_alloc(alloc, tokens_input);
-    ggml_allocr_alloc(alloc, target_probs);
+    ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
+    size_t max_input_size = ggml_backend_buffer_get_size(input_data);
+    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
 
     // context for compute tensors without their data
     const size_t estimated_compute_size_wo_data = (
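For readers following the API change in the hunk above: the old flow measured sizes by hand (GGML_PAD over each tensor plus alignment slack) and then bound tensors into a caller-owned byte vector, while the new flow lets the backend do both steps in one call. A minimal sketch of the new pattern, assuming only the public ggml headers and a context created with no_alloc = true (the 512-token shape and the alloc_inputs name are illustrative, not part of the example program):

#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// sketch: create input tensors without data, then have the backend size and
// allocate a single CPU buffer holding every tensor in the context
static ggml_backend_buffer_t alloc_inputs(struct ggml_context * ctx_input,
                                          struct ggml_tensor ** tokens_out) {
    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx_input, GGML_TYPE_I32, 512);
    *tokens_out = tokens;

    // replaces the manual GGML_PAD + ggml_allocr_new + ggml_allocr_alloc sequence
    ggml_backend_buffer_t buf =
        ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
    printf("input buffer: %zu bytes\n", ggml_backend_buffer_get_size(buf));
    return buf;
}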
@@ -1127,7 +1078,7 @@ int main(int argc, char ** argv) {
     // find best evaluation order
     for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
         ctx_compute = ggml_init(ctx_compute_params);
-        alloc = ggml_allocr_new_measure(tensor_alignment);
+        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
         gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
         gf->order = (enum ggml_cgraph_eval_order) order;
         gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@@ -1140,9 +1091,10 @@ int main(int argc, char ** argv) {
             &logits, tokens_input, target_probs,
             n_tokens, n_batch,
             params.common.use_flash,
-            params.common.use_checkpointing
+            params.common.use_checkpointing,
+            true
         );
-        size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
+        size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
         if (max_compute_size < best_compute_size) {
             best_compute_size = max_compute_size;
             best_order = gf->order;
@@ -1157,9 +1109,8 @@ int main(int argc, char ** argv) {
         "invalid");
 
     // allocate compute tensors
-    mem_compute_data.resize(max_compute_size);
     ctx_compute = ggml_init(ctx_compute_params);
-    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
+    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
     gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
     gf->order = best_order;
     gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@@ -1172,7 +1123,8 @@ int main(int argc, char ** argv) {
         &logits, tokens_input, target_probs,
         n_tokens, n_batch,
         params.common.use_flash,
-        params.common.use_checkpointing
+        params.common.use_checkpointing,
+        false
     );
 
     std::vector<llama_token> train_tokens;
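The order-search loop above now measures each candidate evaluation order with a throwaway graph allocator rather than a measure allocator. A compact sketch of the same pattern (build_graph is a hypothetical stand-in for the example's own graph construction, not a ggml API):

#include <stdint.h> // SIZE_MAX

// sketch: keep the evaluation order whose compute buffer comes out smallest
size_t best_size = SIZE_MAX;
enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT;
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    struct ggml_cgraph * gb = build_graph((enum ggml_cgraph_eval_order) order); // hypothetical helper
    ggml_gallocr_reserve(alloc, gb);                    // sizes (and, per the FIXME above, allocates) the buffer
    size_t sz = ggml_gallocr_get_buffer_size(alloc, 0); // single-buffer case: buffer id 0
    if (sz < best_size) {
        best_size  = sz;
        best_order = (enum ggml_cgraph_eval_order) order;
    }
    ggml_gallocr_free(alloc);                           // drop the measuring allocator
}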

flake.lock (generated)

@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1706732774,
-        "narHash": "sha256-hqJlyJk4MRpcItGYMF+3uHe8HvxNETWvlGtLuVpqLU0=",
+        "lastModified": 1708118438,
+        "narHash": "sha256-kk9/0nuVgA220FcqH/D2xaN6uGyHp/zoxPNUmPCMmEE=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "b8b232ae7b8b144397fdb12d20f592e5e7c1a64d",
+        "rev": "5863c27340ba4de8f83e7e3c023b9599c3cb3c80",
         "type": "github"
       },
       "original": {


@ -150,6 +150,7 @@
packages = packages =
{ {
default = config.legacyPackages.llamaPackages.llama-cpp; default = config.legacyPackages.llamaPackages.llama-cpp;
vulkan = config.packages.default.override { useVulkan = true; };
} }
// lib.optionalAttrs pkgs.stdenv.isLinux { // lib.optionalAttrs pkgs.stdenv.isLinux {
opencl = config.packages.default.override { useOpenCL = true; }; opencl = config.packages.default.override { useOpenCL = true; };
@ -157,7 +158,6 @@
mpi-cpu = config.packages.default.override { useMpi = true; }; mpi-cpu = config.packages.default.override { useMpi = true; };
mpi-cuda = config.packages.default.override { useMpi = true; }; mpi-cuda = config.packages.default.override { useMpi = true; };
vulkan = config.packages.default.override { useVulkan = true; };
} }
// lib.optionalAttrs (system == "x86_64-linux") { // lib.optionalAttrs (system == "x86_64-linux") {
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp; rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;

(file diff suppressed because it is too large)

@@ -6,88 +6,62 @@
 extern "C" {
 #endif
 
-struct ggml_backend;
-struct ggml_backend_buffer;
-struct ggml_backend_buffer_type;
-
-//
-// Legacy API
-//
-
-typedef struct ggml_allocr * ggml_allocr_t;
-
-// initialize allocator for use with CPU backend only
-GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
-
-// initialize allocator for use with ggml-backend
-GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
-
-GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
-
-// tell the allocator to parse nodes following the order described in the list
-// you should call this if your graph are optimized to execute out-of-order
-GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
-
-GGML_API void   ggml_allocr_free       (ggml_allocr_t alloc);
-GGML_API bool   ggml_allocr_is_measure (ggml_allocr_t alloc);
-GGML_API void   ggml_allocr_reset      (ggml_allocr_t alloc);
-GGML_API void   ggml_allocr_alloc      (ggml_allocr_t alloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_allocr_max_size   (ggml_allocr_t alloc);
-GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
-
-//
-// ggml-backend v2 API
-//
-
-// Separate tensor and graph allocator objects
-// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
-// The original API is kept as a wrapper around the new API
+typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend * ggml_backend_t;
 
 // Tensor allocator
 typedef struct ggml_tallocr * ggml_tallocr_t;
 
-GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
-
-GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
-
-GGML_API void   ggml_tallocr_free       (ggml_tallocr_t talloc);
-GGML_API bool   ggml_tallocr_is_measure (ggml_tallocr_t talloc);
-GGML_API void   ggml_tallocr_reset      (ggml_tallocr_t talloc);
-GGML_API void   ggml_tallocr_alloc      (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_tallocr_max_size   (ggml_tallocr_t talloc);
+GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void           ggml_tallocr_free(ggml_tallocr_t talloc);
+GGML_API void           ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
 
 // Graph allocator
+/*
+  Example usage:
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
+
+    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
+    ggml_gallocr_reserve(galloc, build_graph(max_batch));
+
+    // allocate the graph
+    struct ggml_cgraph * graph = build_graph(batch);
+    ggml_gallocr_alloc_graph(galloc, graph);
+    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
+
+    // evaluate the graph
+    ggml_backend_graph_compute(backend, graph);
+*/
+
+// special tensor flags for use with the graph allocator:
+//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
+//   ggml_set_output(): output tensors are never freed and never overwritten
+
 typedef struct ggml_gallocr * ggml_gallocr_t;
 
-GGML_API ggml_gallocr_t ggml_gallocr_new(void);
-GGML_API void   ggml_gallocr_free(ggml_gallocr_t galloc);
+GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
+GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
+GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
 
-GGML_API void   ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
-GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
+// pre-allocate buffers from a measure graph - does not allocate or modify the graph
+// call with a worst-case graph to avoid buffer reallocations
+// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
+// returns false if the buffer allocation failed
+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
 
-// Allocate tensors from the allocators given by the hash table
-GGML_API void ggml_gallocr_alloc_graph_n(
-    ggml_gallocr_t galloc,
-    struct ggml_cgraph * graph,
-    struct ggml_hash_set hash_set,
-    ggml_tallocr_t * hash_node_talloc);
+// automatic reallocation if the topology changes when using a single buffer
+// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
+GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+
+GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
 
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
 
 #ifdef __cplusplus
 }
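The header comment above gives the shape of the new API; a self-contained sketch of the single-buffer flow on the CPU backend follows (the tiny mat-mul graph and the context sizing are illustrative, and error checks are omitted):

#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main(void) {
    // metadata-only context: tensor data will live in backend buffers
    struct ggml_init_params ip = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    ggml_set_input(a);                       // inputs get stable, non-overlapping addresses
    ggml_set_input(b);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);
    ggml_set_output(c);                      // never freed or overwritten by the allocator

    struct ggml_cgraph * graph = ggml_new_graph(ctx);
    ggml_build_forward_expand(graph, c);

    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    ggml_gallocr_alloc_graph(galloc, graph); // single buffer: reallocates automatically
    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

    float ones[16];
    for (int i = 0; i < 16; ++i) ones[i] = 1.0f;
    ggml_backend_tensor_set(a, ones, 0, sizeof(ones));
    ggml_backend_tensor_set(b, ones, 0, sizeof(ones));

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_graph_compute(backend, graph);

    ggml_backend_free(backend);
    ggml_gallocr_free(galloc);
    ggml_free(ctx);
    return 0;
}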

(file diff suppressed because it is too large)

@@ -83,8 +83,9 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);
 
     GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    GGML_API           void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
     // Create a backend buffer from an existing pointer
     GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
@@ -129,11 +130,7 @@ extern "C" {
         // in build_graph:
         build_graph(...) {
-            // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
-            alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
-            ggml_allocr_alloc(alloc_cpu, tensor);
-
-            // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
+            // manually assign nodes to a backend (optional, should not be needed in most cases)
             struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
             ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
         }
@@ -163,20 +160,19 @@ extern "C" {
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
     // Initialize backend buffers from a measure graph
-    GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
     // Get the number of splits of the last graph
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
 
-    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
     GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
     // Allocate and compute graph on the backend scheduler
-    GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
 
-    // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
+    // Reset all assignments and allocators - must be called before changing the node backends
     GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
     // Set a callback to be called for each resulting node during graph compute
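Both scheduler entry points that used to return void now report failure, so callers can recover from allocation errors instead of aborting. A sketch of the updated call sequence, assuming backend_gpu, backend_cpu, and the two build_graph* helpers already exist in the caller (they are not ggml APIs), and that passing NULL buffer types selects each backend's default:

ggml_backend_t backends[2] = { backend_gpu, backend_cpu };
ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE);

// size the per-backend buffers once, from a worst-case graph
if (!ggml_backend_sched_reserve(sched, build_graph_worst_case())) {
    fprintf(stderr, "failed to reserve scheduler buffers\n");
}

struct ggml_cgraph * graph = build_graph();
if (!ggml_backend_sched_graph_compute(sched, graph)) { // bool instead of void now
    fprintf(stderr, "graph compute failed\n");
}

printf("gpu buffer: %zu bytes\n", ggml_backend_sched_get_buffer_size(sched, backend_gpu));
ggml_backend_sched_reset(sched); // required before changing node->backend assignments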

(file diff suppressed because it is too large)

@@ -53,11 +53,23 @@ extern "C" {
 //
 
 #include <arm_neon.h>
 
-#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 
-#define GGML_FP16_TO_FP32(x) ((float) (x))
-#define GGML_FP32_TO_FP16(x) (x)
+#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    __fp16 tmp;
+    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+    return (float)tmp;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    ggml_fp16_t res;
+    __fp16 tmp = f;
+    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+    return res;
+}
 
 #else
 
@@ -214,8 +226,7 @@ extern float ggml_table_f32_f16[1 << 16];
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
 // This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+#if !defined(GGML_FP16_TO_FP32)
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
     uint16_t s;
     memcpy(&s, &f, sizeof(uint16_t));
@@ -223,8 +234,10 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 }
 
 #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+#endif
 
+#if !defined(GGML_FP32_TO_FP16)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif
 
 #define GGML_HASHTABLE_FULL ((size_t)-1)
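On targets without native fp16 conversion, the non-NEON branch pairs GGML_COMPUTE_FP16_TO_FP32 with the 64K-entry ggml_table_f32_f16 so the hot path is a single load. A sketch of how the table and the lookup above fit together (table_f32_f16 stands in for ggml_table_f32_f16, which ggml fills once during initialization):

#include <stdint.h>
#include <string.h>

static float table_f32_f16[1 << 16]; // stand-in for ggml_table_f32_f16

static void init_fp16_table(void) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        uint16_t bits = (uint16_t) i;
        ggml_fp16_t h;
        memcpy(&h, &bits, sizeof(h));
        table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(h); // slow path, run once per entry
    }
}

static inline float fp16_to_fp32_fast(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(s)); // same bit-copy idiom as ggml_lookup_fp16_to_fp32
    return table_f32_f16[s];
}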


@@ -61,6 +61,9 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
     GGML_METAL_KERNEL_TYPE_RMS_NORM,
     GGML_METAL_KERNEL_TYPE_GROUP_NORM,
@@ -83,6 +86,9 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
   //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,
@@ -101,6 +107,9 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,
@@ -116,6 +125,9 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
@@ -131,6 +143,9 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
     GGML_METAL_KERNEL_TYPE_ROPE_F32,
     GGML_METAL_KERNEL_TYPE_ROPE_F16,
     GGML_METAL_KERNEL_TYPE_ALIBI_F32,
@@ -176,7 +191,7 @@ struct ggml_metal_context {
 // MSL code
 // TODO: move the contents here when ready
 //       for now it is easier to work in a separate file
-//static NSString * const msl_library_source = @"see metal.metal";
+// static NSString * const msl_library_source = @"see metal.metal";
 
 // Here to assist with NSBundle Path Hack
 @interface GGMLMetalClass : NSObject
@@ -272,6 +287,14 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
                 return NULL;
             }
         } else {
+#if GGML_METAL_EMBED_LIBRARY
+            GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);
+
+            extern const char ggml_metallib_start[];
+            extern const char ggml_metallib_end[];
+
+            NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
+#else
             GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
             NSString * sourcePath;
@@ -294,6 +317,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                 return NULL;
             }
+#endif
 
         @autoreleasepool {
             // dictionary of preprocessor macros
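The embedded-library branch above expects the linker to provide ggml_metallib_start/ggml_metallib_end bracketing the raw .metal source. One common way to produce such symbols is an assembler stub that .incbin's the file; a hedged sketch of a build-time generator for that stub follows (the section name and the Mach-O underscore prefix are assumptions about the build in use, not something this diff specifies):

#include <stdio.h>

// emits an assembly stub on stdout; assemble the output and link it in
int main(void) {
    printf(".section __DATA,__ggml_metallib\n");
    printf(".globl _ggml_metallib_start\n");
    printf("_ggml_metallib_start:\n");
    printf(".incbin \"ggml-metal.metal\"\n");
    printf(".globl _ggml_metallib_end\n");
    printf("_ggml_metallib_end:\n");
    return 0;
}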
@@ -433,6 +457,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction);
@@ -455,6 +482,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
       //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction);
@@ -473,6 +503,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm);
@@ -488,6 +521,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm);
@@ -503,6 +539,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
@@ -687,6 +726,7 @@ static bool ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
                struct ggml_cgraph * gf) {
 
+    @autoreleasepool {
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
     edesc.dispatchType = MTLDispatchTypeSerial;
@@ -727,6 +767,7 @@ static bool ggml_metal_graph_compute(
         size_t offs_src0 = 0;
         size_t offs_src1 = 0;
+        size_t offs_src2 = 0;
         size_t offs_dst  = 0;
 
         id<MTLCommandBuffer> command_buffer  = command_buffers[cb_idx];
@@ -745,6 +786,7 @@ static bool ggml_metal_graph_compute(
                 struct ggml_tensor * src0 = gf->nodes[i]->src[0];
                 struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+                struct ggml_tensor * src2 = gf->nodes[i]->src[2];
                 struct ggml_tensor * dst  = gf->nodes[i];
 
                 switch (dst->op) {
@@ -806,6 +848,7 @@ static bool ggml_metal_graph_compute(
                 id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
                 id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
+                id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
                 id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;
 
                 //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
@@ -1187,7 +1230,16 @@ static bool ggml_metal_graph_compute(
                             pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
                         }
 
                         const float scale = ((float *) dst->op_params)[0];
+                        const float max_bias = ((float *) dst->op_params)[1];
+
+                        const int64_t nrows_x = ggml_nrows(src0);
+                        const int64_t nrows_y = src0->ne[1];
+                        const uint32_t n_head_kv = nrows_x/nrows_y;
+                        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+
+                        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
                         [encoder setComputePipelineState:pipeline];
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1196,11 +1248,20 @@ static bool ggml_metal_graph_compute(
                         } else {
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
                         }
-                        [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                        [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+                        if (id_src2) {
+                            [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
+                        } else {
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2];
+                        }
+                        [encoder setBuffer:id_dst offset:offs_dst atIndex:3];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:4];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:5];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6];
+                        [encoder setBytes:&scale length:sizeof(scale) atIndex:7];
+                        [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:8];
+                        [encoder setBytes:&m0 length:sizeof(m0) atIndex:9];
+                        [encoder setBytes:&m1 length:sizeof(m1) atIndex:10];
+                        [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:11];
 
                         [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
                         [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
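The m0/m1 factors staged into the soft-max kernel above expand into one slope per attention head, following the usual ALiBi scheme: the first n_head_log2 heads take successive powers of m0, and the remainder take odd powers of m1. Restated as a standalone sketch (illustrative C mirroring what the Metal kernel computes per head; not a ggml API):

#include <math.h>
#include <stdint.h>

static float alibi_slope(uint32_t n_head_log2, float m0, float m1, uint32_t h) {
    return h < n_head_log2
        ? powf(m0, (float)(h + 1))                    // first group: m0^1, m0^2, ...
        : powf(m1, (float)(2*(h - n_head_log2) + 1)); // second group: m1^1, m1^3, ...
}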
@@ -1296,6 +1357,9 @@ static bool ggml_metal_graph_compute(
                             case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
                             case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
                             case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
+                            case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32  ].pipeline; break;
+                            case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32  ].pipeline; break;
+                            case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
                             default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
                         }
@@ -1430,6 +1494,24 @@ static bool ggml_metal_graph_compute(
                                     nth1 = 16;
                                     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
                                 } break;
+                            case GGML_TYPE_IQ3_S:
+                                {
+                                    nth0 = 4;
+                                    nth1 = 16;
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
+                                } break;
+                            case GGML_TYPE_IQ1_S:
+                                {
+                                    nth0 = 4;
+                                    nth1 = 16;
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
+                                } break;
+                            case GGML_TYPE_IQ4_NL:
+                                {
+                                    nth0 = 4;
+                                    nth1 = 16;
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline;
+                                } break;
                             default:
                                 {
                                     GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
@@ -1464,7 +1546,7 @@ static bool ggml_metal_graph_compute(
                         if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
                             src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
-                            src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
+                            src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S) { // || src0t == GGML_TYPE_Q4_K) {
                             [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
                         else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
@@ -1472,11 +1554,16 @@ static bool ggml_metal_graph_compute(
                             [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
                             [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
-                        else if (src0t == GGML_TYPE_IQ3_XXS) {
-                            const int mem_size = 256*4+128;
+                        else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
+                            const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
                             [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
                             [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
+                        else if (src0t == GGML_TYPE_IQ4_NL) {
+                            const int mem_size = 32*sizeof(float);
+                            [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                            [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                        }
                        else if (src0t == GGML_TYPE_Q4_K) {
                             [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
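For reference, the threadgroup scratch sizes chosen in the dispatch above work out as follows (the byte arithmetic is taken directly from the code; what each kernel stores in its scratch is not restated here):

// threadgroup memory per IQ variant, as dispatched above
enum {
    MEM_IQ3_XXS = 256*4 + 128,      // 1152 bytes
    MEM_IQ3_S   = 512*4,            // 2048 bytes
    MEM_IQ4_NL  = 32*sizeof(float), //  128 bytes
};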
@@ -1513,8 +1600,6 @@ static bool ggml_metal_graph_compute(
                         // max size of the src1ids array in the kernel stack
                         GGML_ASSERT(ne11 <= 512);
 
-                        struct ggml_tensor * src2 = gf->nodes[i]->src[2];
-
                         const int64_t  ne20 = src2 ? src2->ne[0] : 0;
                         const int64_t  ne21 = src2 ? src2->ne[1] : 0;
                         const int64_t  ne22 = src2 ? src2->ne[2] : 0;
@@ -1572,6 +1657,9 @@ static bool ggml_metal_graph_compute(
                             case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
                             case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
                             case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
+                            case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32  ].pipeline; break;
+                            case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32  ].pipeline; break;
+                            case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
                             default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
                         }
@@ -1709,6 +1797,24 @@ static bool ggml_metal_graph_compute(
                                     nth1 = 16;
                                     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline;
                                 } break;
+                            case GGML_TYPE_IQ3_S:
+                                {
+                                    nth0 = 4;
+                                    nth1 = 16;
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline;
+                                } break;
+                            case GGML_TYPE_IQ1_S:
+                                {
+                                    nth0 = 4;
+                                    nth1 = 16;
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline;
+                                } break;
+                            case GGML_TYPE_IQ4_NL:
+                                {
+                                    nth0 = 4;
+                                    nth1 = 16;
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
+                                } break;
                             default:
                                 {
                                     GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
@@ -1759,7 +1865,7 @@ static bool ggml_metal_graph_compute(
                         if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
                             src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
-                            src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
+                            src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S) { // || src2t == GGML_TYPE_Q4_K) {
                             [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
                         else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
@@ -1767,11 +1873,16 @@ static bool ggml_metal_graph_compute(
                             [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
                             [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
-                        else if (src2t == GGML_TYPE_IQ3_XXS) {
-                            const int mem_size = 256*4+128;
+                        else if (src2t == GGML_TYPE_IQ3_XXS || src2t == GGML_TYPE_IQ3_S) {
+                            const int mem_size = src2t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
                             [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
                             [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
+                        else if (src2t == GGML_TYPE_IQ4_NL) {
+                            const int mem_size = 32*sizeof(float);
+                            [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                            [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                        }
                         else if (src2t == GGML_TYPE_Q4_K) {
                             [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
@@ -1813,6 +1924,9 @@ static bool ggml_metal_graph_compute(
                     case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
                     case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
                     case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
+                    case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S  ].pipeline; break;
+                    case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S  ].pipeline; break;
+                    case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
                     case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
                     default: GGML_ASSERT(false && "not implemented");
                 }
@@ -2148,8 +2262,8 @@ static bool ggml_metal_graph_compute(
                         id<MTLComputePipelineState> pipeline = nil;
 
                         switch (order) {
-                            case GGML_SORT_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
-                            case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
+                            case GGML_SORT_ORDER_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
+                            case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
                             default: GGML_ASSERT(false);
                         };
@@ -2272,6 +2386,7 @@ static bool ggml_metal_graph_compute(
             [[MTLCaptureManager sharedCaptureManager] stopCapture];
         }
     }
+    }
 
     return true;
 }

(file diff suppressed because it is too large)

@@ -1354,7 +1354,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
 }
 
 void ggml_cl_free_data(const struct ggml_tensor* tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU) {
+    if (tensor->backend != GGML_BACKEND_TYPE_GPU) {
         return;
     }
@@ -1412,7 +1412,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
 }
 
 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1476,7 +1476,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
 }
 
 static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1566,13 +1566,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
         d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
-    cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-    cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
+    cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
     size_t x_offset = 0;
@@ -1580,7 +1580,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         // TODO: copy src0 here when r3>1
         for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
+                if (src0->backend == GGML_BACKEND_TYPE_GPU) {
                     x_offset = (i03 * ne02 + i02) * x_ne;
                 } else {
                     // copy src0 to device
@@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                 for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                     // copy src1 to device
-                    if (src1->backend == GGML_BACKEND_CPU) {
+                    if (src1->backend == GGML_BACKEND_TYPE_CPU) {
                         CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
                     }
@@ -1612,7 +1612,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                     }
 
                     // copy dst to host
-                    if (dst->backend == GGML_BACKEND_CPU) {
+                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
                         CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
                     }
@@ -1621,13 +1621,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    if (src0->backend != GGML_BACKEND_GPU) {
+    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
-    if (src1->backend != GGML_BACKEND_GPU) {
+    if (src1->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_Y, y_size);
     }
-    if (dst->backend != GGML_BACKEND_GPU) {
+    if (dst->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_D, d_size);
     }
 }
@@ -1670,7 +1670,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
         d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
@@ -1687,7 +1687,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         // TODO: copy src0 here when r3>1
         for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
+                if (src0->backend == GGML_BACKEND_TYPE_GPU) {
                     x_offset = (i03 * ne02 + i02) * x_ne;
                 } else {
                     // copy src0 to device
@@ -1741,7 +1741,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                     }
 
                     // copy dst to host, then convert to float
-                    if (dst->backend == GGML_BACKEND_CPU) {
+                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                         CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
                         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
                         ggml_fp16_to_fp32_row(tmp, d, d_ne);
@@ -1753,7 +1753,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    if (src0->backend != GGML_BACKEND_GPU) {
+    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);
@@ -1798,7 +1798,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
     cl_mem d_Q;
-    if (src0->backend == GGML_BACKEND_CPU) {
+    if (src0->backend == GGML_BACKEND_TYPE_CPU) {
        d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
     }
@ -1817,10 +1817,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i02 = 0; i02 < ne02; i02++) {
// copy src0 to device if necessary // copy src0 to device if necessary
if (src0->backend == GGML_BACKEND_CPU) { if (src0->backend == GGML_BACKEND_TYPE_CPU) {
events.emplace_back(); events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
} else if (src0->backend == GGML_BACKEND_GPU) { } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
d_Q = (cl_mem) src0->extra; d_Q = (cl_mem) src0->extra;
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
@ -1829,7 +1829,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
if (!mul_mat_vec) { if (!mul_mat_vec) {
// convert src0 to fp32 on device // convert src0 to fp32 on device
const size_t global = x_ne / global_denom; const size_t global = x_ne / global_denom;
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
@ -1843,7 +1843,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
// compute // compute
const size_t global = ne01 * local; const size_t global = ne01 * local;
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
const cl_int ncols = ne00; const cl_int ncols = ne00;
events.emplace_back(); events.emplace_back();
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@ -1895,7 +1895,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
} }
ggml_cl_pool_free(d_Y, y_size); ggml_cl_pool_free(d_Y, y_size);
ggml_cl_pool_free(d_D, d_size); ggml_cl_pool_free(d_D, d_size);
if (src0->backend == GGML_BACKEND_CPU) { if (src0->backend == GGML_BACKEND_TYPE_CPU) {
ggml_cl_pool_free(d_Q, q_size); ggml_cl_pool_free(d_Q, q_size);
} }
} }
@ -1911,7 +1911,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) { ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) {
return true; return true;
} }
@ -1993,7 +1993,7 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
CL_CHECK(clFinish(queue)); CL_CHECK(clFinish(queue));
tensor->extra = dst; tensor->extra = dst;
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
} }
// ggml-backend // ggml-backend
@ -2045,7 +2045,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
ctx->sub_buffers.push_back(sub_buffer); ctx->sub_buffers.push_back(sub_buffer);
tensor->extra = sub_buffer; tensor->extra = sub_buffer;
} }
tensor->backend = GGML_BACKEND_GPU; tensor->backend = GGML_BACKEND_TYPE_GPU;
} }
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {

File diff suppressed because it is too large

View file

@ -191,6 +191,36 @@ typedef struct {
} block_iq3_xxs; } block_iq3_xxs;
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
// 3.4375 bpw
#if QK_K == 64
#define IQ3S_N_SCALE 2
#else
#define IQ3S_N_SCALE QK_K/64
#endif
typedef struct {
ggml_fp16_t d;
uint8_t qs[QK_K/4];
uint8_t qh[QK_K/32];
uint8_t signs[QK_K/8];
uint8_t scales[IQ3S_N_SCALE];
} block_iq3_s;
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
typedef struct {
ggml_fp16_t d;
uint8_t qs[QK_K/8];
uint8_t scales[QK_K/16];
} block_iq1_s;
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
// Non-linear quants
#define QK4_NL 32
typedef struct {
ggml_fp16_t d;
uint8_t qs[QK4_NL/2];
} block_iq4_nl;
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
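
A quick arithmetic check of the new block layouts above, assuming the default `QK_K == 256` build: `block_iq3_s` packs 256 weights into 2 + 64 + 8 + 32 + 4 = 110 bytes, which is exactly the advertised 3.4375 bpw; `block_iq1_s` works out to 1.5625 bpw and `block_iq4_nl` to 4.5 bpw. A standalone sketch that reproduces the numbers (not part of the commit):

```cpp
#include <cstdio>

// Bits-per-weight check for the new block layouts, assuming QK_K == 256.
#define QK_K    256
#define QK4_NL  32

int main() {
    const int iq3_s  = 2 + QK_K/4 + QK_K/32 + QK_K/8 + QK_K/64; // d, qs, qh, signs, scales
    const int iq1_s  = 2 + QK_K/8 + QK_K/16;                    // d, qs, scales
    const int iq4_nl = 2 + QK4_NL/2;                            // d, qs
    std::printf("iq3_s : %6.4f bpw\n", 8.0 * iq3_s  / QK_K);    // 3.4375
    std::printf("iq1_s : %6.4f bpw\n", 8.0 * iq1_s  / QK_K);    // 1.5625
    std::printf("iq4_nl: %6.4f bpw\n", 8.0 * iq4_nl / QK4_NL);  // 4.5000
    return 0;
}
```
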
@ -210,6 +240,8 @@ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGM
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k); void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k); void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k); void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
@ -225,6 +257,8 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
// Dequantization // Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
@ -243,22 +277,28 @@ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRI
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
// Dot product // Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
// //
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
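
Every `ggml_vec_dot_*` declaration above gains four parameters — `bs` (output stride), `bx`/`by` (input strides) and `nrc` (number of result values) — so one call can now produce several dot products. My reading of the single-result case is sketched below; the argument values are an assumption drawn from the new signature, not stated in this diff. With `nrc == 1` the strides are not consumed, so zero is a safe filler:

```cpp
// Hedged fragment: a caller updated for the widened signature, single-row case.
float sum = 0.0f;
ggml_vec_dot_q4_0_q8_0(/*n=*/n, /*s=*/&sum, /*bs=*/0,
                       /*vx=*/x_blocks, /*bx=*/0,
                       /*vy=*/y_blocks, /*by=*/0, /*nrc=*/1);
```
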
@ -266,6 +306,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
@ -276,8 +319,8 @@ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row,
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
void iq2xs_init_impl(int grid_size); void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(int grid_size); void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size); void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size); void iq3xs_free_impl(int grid_size);
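
`iq2xs_init_impl`/`iq2xs_free_impl` are now keyed by the quantization type instead of a raw lattice grid size, which keeps that internal detail out of the call sites. A sketch of the updated calling convention (the surrounding calls are illustrative, not taken from this diff):

```cpp
// Lookup tables are now requested per ggml_type rather than per grid size.
iq2xs_init_impl(GGML_TYPE_IQ2_XS);   // previously the argument was an int grid size
// ... quantize / dequantize iq2_xs tensors ...
iq2xs_free_impl(GGML_TYPE_IQ2_XS);
```
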

View file

@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){
size_t total_elements = ggml_nelements(src); size_t total_elements = ggml_nelements(src);
const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT; const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
float *src_data =NULL; float *src_data =NULL;
if(src_on_device) { if(src_on_device) {
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
int ixj = col ^ j; int ixj = col ^ j;
if (ixj > col) { if (ixj > col) {
if ((col & k) == 0) { if ((col & k) == 0) {
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
swap(dst_row[col], dst_row[ixj]); swap(dst_row[col], dst_row[ixj]);
} }
} else { } else {
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
swap(dst_row[col], dst_row[ixj]); swap(dst_row[col], dst_row[ixj]);
} }
} }
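
The comparator above is the core of a bitonic argsort: the template parameter fixes the sort direction at compile time, and the renamed `GGML_SORT_ORDER_*` enumerators select which comparison flips. A CPU analogue of the same idea, as a self-contained sketch (the names are mine, not the kernel's):

```cpp
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

enum sort_order { SORT_ORDER_ASC, SORT_ORDER_DESC };

// CPU analogue of k_argsort_f32_i32: sort indices by value, with the
// direction resolved at compile time like the kernel's template parameter.
template <sort_order order>
static std::vector<int> argsort(const std::vector<float> & x) {
    std::vector<int> idx(x.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(), [&](int a, int b) {
        return order == SORT_ORDER_ASC ? x[a] < x[b] : x[a] > x[b];
    });
    return idx;
}

int main() {
    for (int i : argsort<SORT_ORDER_ASC>({3.f, 1.f, 2.f})) std::printf("%d ", i); // prints: 1 2 0
    std::printf("\n");
    return 0;
}
```
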
@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
} }
} }
static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, template <int qk, int qi, typename block_q_t, int vdr,
float *dst, const int ncols, vec_dot_q_sycl_t vec_dot_q_sycl>
const int nrows, static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
dpct::queue_ptr stream) { float *dst, const int ncols,
GGML_ASSERT(ncols % QK4_0 == 0); const int nrows,
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; dpct::queue_ptr stream) {
const sycl::range<3> block_nums(1, 1, block_num_y); GGML_ASSERT(ncols % QK4_0 == 0);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
stream->parallel_for( const sycl::range<3> block_nums(1, 1, block_num_y);
sycl::nd_range<3>(block_nums * block_dims, block_dims), const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { stream->parallel_for(
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows, ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
item_ct1); mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
}); vx, vy, dst, ncols, nrows, item_ct1);
} });
static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK4_1 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK5_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK5_1 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK8_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
} }
int get_device_index_by_id(int id){ int get_device_index_by_id(int id){
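
The nine near-identical `mul_mat_vec_*_q8_1_sycl` launchers deleted above differed only in their compile-time constants; the new `mul_mat_vec_q_sycl_submitter` keeps the launch boilerplate in one template. The same pattern in a dependency-free sketch (hypothetical types and kernels, no SYCL required):

```cpp
#include <cstdio>

struct block_q_a { float d; };
struct block_q_b { float d; };

static float dot_a(const block_q_a & b, float y) { return b.d * y; }
static float dot_b(const block_q_b & b, float y) { return b.d + y; }

// One templated submitter replaces a family of copy-pasted wrappers that
// differed only in block size, block type and dot kernel.
template <int qk, typename block_t, float (*vec_dot)(const block_t &, float)>
static void submit(const block_t & blk, float y) {
    static_assert(qk % 8 == 0, "block size must be a multiple of 8");
    std::printf("result: %g\n", vec_dot(blk, y));
}

int main() {
    submit<32, block_q_a, dot_a>(block_q_a{2.0f}, 3.0f); // result: 6
    submit<32, block_q_b, dot_b>(block_q_b{2.0f}, 3.0f); // result: 5
    return 0;
}
```
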
@ -10977,7 +10825,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
const sycl::range<3> block_dims(1, 1, ncols); const sycl::range<3> block_dims(1, 1, ncols);
const sycl::range<3> block_nums(1, nrows, 1); const sycl::range<3> block_nums(1, nrows, 1);
if (order == GGML_SORT_ASC) { if (order == GGML_SORT_ORDER_ASC) {
/* /*
DPCT1049:44: The work-group size passed to the SYCL kernel may exceed DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query the limit. To get the device limit, query
@ -10986,9 +10834,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
stream->parallel_for( stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1); k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
}); });
} else if (order == GGML_SORT_DESC) { } else if (order == GGML_SORT_ORDER_DESC) {
/* /*
DPCT1049:45: The work-group size passed to the SYCL kernel may exceed DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query the limit. To get the device limit, query
@ -10997,7 +10845,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
stream->parallel_for( stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1); k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
}); });
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
@ -11559,12 +11407,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
dpct::memcpy_direction kind; dpct::memcpy_direction kind;
char * src_ptr; char * src_ptr;
if (src->backend == GGML_BACKEND_CPU) { if (src->backend == GGML_BACKEND_TYPE_CPU) {
kind = dpct::host_to_device; kind = dpct::host_to_device;
src_ptr = (char *) src->data; src_ptr = (char *) src->data;
// GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_CPU src_ptr %p\n", src_ptr); // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
} else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
kind = dpct::device_to_device; kind = dpct::device_to_device;
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
int id; int id;
@ -11578,11 +11426,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
} }
char * dst_ptr = (char *) dst; char * dst_ptr = (char *) dst;
const int64_t ne0 = src->ne[0]; GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
const int64_t nb0 = src->nb[0]; GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
const int64_t nb1 = src->nb[1];
const int64_t nb2 = src->nb[2];
const int64_t nb3 = src->nb[3];
const enum ggml_type type = src->type; const enum ggml_type type = src->type;
const int64_t ts = ggml_type_size(type); const int64_t ts = ggml_type_size(type);
const int64_t bs = ggml_blck_size(type); const int64_t bs = ggml_blck_size(type);
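
Several hunks in this file replace hand-written `ne00`/`nb01`-style locals with the `GGML_TENSOR_LOCALS` helpers. Paraphrased from ggml.h (approximate, not quoted from this diff), the macros expand to one `const` local per dimension, each marked used so unused-variable warnings stay quiet:

```cpp
// Approximate expansion of the helper macros, paraphrased from ggml.h:
#define GGML_UNUSED(x) (void)(x)

#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
    const type prefix##0 = (pointer)->array[0]; GGML_UNUSED(prefix##0);

#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
    const type prefix##0 = (pointer)->array[0]; GGML_UNUSED(prefix##0); \
    const type prefix##1 = (pointer)->array[1]; GGML_UNUSED(prefix##1); \
    const type prefix##2 = (pointer)->array[2]; GGML_UNUSED(prefix##2); \
    const type prefix##3 = (pointer)->array[3]; GGML_UNUSED(prefix##3);

// So GGML_TENSOR_LOCALS(int64_t, nb, src, nb) declares nb0..nb3 from src->nb,
// replacing four hand-written locals per tensor.
```
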
@ -12001,7 +11846,7 @@ inline void ggml_sycl_op_mul_mat_q(
// the main device has a larger memory buffer to hold the results from all GPUs // the main device has a larger memory buffer to hold the results from all GPUs
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff; const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
@ -12098,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q(
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t row_diff = row_high - row_low; const int64_t row_diff = row_high - row_low;
// TODO: support these quantization types
GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
src0->type == GGML_TYPE_IQ2_XS ||
src0->type == GGML_TYPE_IQ3_XXS ||
src0->type == GGML_TYPE_IQ1_S));
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
break; VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_1:
mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
break; VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_0:
mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
break; VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_1:
mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
break; VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
break; VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q2_K: case GGML_TYPE_Q2_K:
mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
break; VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q3_K: case GGML_TYPE_Q3_K:
mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
break; VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
break; VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
break; VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
break; VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
break; break;
@ -12148,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
const int64_t src1_ncols, const int64_t src1_padded_row_size, const int64_t src1_ncols, const int64_t src1_padded_row_size,
const dpct::queue_ptr &stream) { const dpct::queue_ptr &stream) {
GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS;
const int64_t row_diff = row_high - row_low; const int64_t row_diff = row_high - row_low;
@ -12248,7 +12119,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
// the main device has a larger memory buffer to hold the results from all GPUs // the main device has a larger memory buffer to hold the results from all GPUs
// ldc == nrows of the matrix that cuBLAS writes into // ldc == nrows of the matrix that cuBLAS writes into
int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff; int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
#ifdef GGML_SYCL_F16 #ifdef GGML_SYCL_F16
bool use_fp16 = true; // TODO(Yu) SYCL capability check bool use_fp16 = true; // TODO(Yu) SYCL capability check
@ -12426,9 +12297,7 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0]; GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t nrows = ggml_nrows(src0); const int64_t nrows = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
@ -12632,16 +12501,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
const bool use_src1 = src1 != nullptr; const bool use_src1 = src1 != nullptr;
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
// dd = data device // dd = data device
float * src0_ddf = nullptr; float * src0_ddf = nullptr;
@ -12696,7 +12565,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst)))); main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
} }
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
SYCL_CHECK(CHECK_TRY_ERROR( SYCL_CHECK(CHECK_TRY_ERROR(
dpct::get_current_device().queues_wait_and_throw())); dpct::get_current_device().queues_wait_and_throw()));
} }
@ -12758,15 +12627,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
ggml_sycl_op_mul_mat_t op, ggml_sycl_op_mul_mat_t op,
const bool convert_src1_to_q8_1) try { const bool convert_src1_to_q8_1) try {
const int64_t ne00 = src0->ne[0]; GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne10 = src1->ne[0]; GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const int64_t nrows1 = ggml_nrows(src1); const int64_t nrows1 = ggml_nrows(src1);
GGML_ASSERT(ne03 == ne13); GGML_ASSERT(ne03 == ne13);
@ -12777,8 +12640,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
const int nb2 = dst->nb[2]; const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3]; const int nb3 = dst->nb[3];
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
@ -12793,13 +12656,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src0_is_contiguous = ggml_is_contiguous(src0);
const bool src1_is_contiguous = ggml_is_contiguous(src1); const bool src1_is_contiguous = ggml_is_contiguous(src1);
int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split && ne02 > 1));
GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne03 > 1));
GGML_ASSERT(!(split && ne02 < ne12)); GGML_ASSERT(!(split && ne02 < ne12));
@ -12854,8 +12717,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
used_devices++; used_devices++;
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index; const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index; const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
ggml_sycl_set_device(get_device_id_by_index(id)); ggml_sycl_set_device(get_device_id_by_index(id));
const dpct::queue_ptr stream = g_syclStreams[id][0]; const dpct::queue_ptr stream = g_syclStreams[id][0];
@ -12919,8 +12782,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
continue; continue;
} }
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index; const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index; const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
const int64_t row_diff = row_high[id] - row_low[id]; const int64_t row_diff = row_high[id] - row_low[id];
ggml_sycl_set_device(get_device_id_by_index(id)); ggml_sycl_set_device(get_device_id_by_index(id));
@ -12946,12 +12809,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
// the main device memory buffer can be on VRAM scratch, with space for all partial results // the main device memory buffer can be on VRAM scratch, with space for all partial results
// in that case an offset on dst_ddf_i is needed // in that case an offset on dst_ddf_i is needed
if (dst->backend == GGML_BACKEND_GPU && id == g_main_device_index) { if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) {
dst_dd_i += row_low[id]; // offset is 0 if no tensor split dst_dd_i += row_low[id]; // offset is 0 if no tensor split
} }
// copy src0, src1 to device if necessary // copy src0, src1 to device if necessary
if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
if (id != g_main_device_index) { if (id != g_main_device_index) {
if (convert_src1_to_q8_1) { if (convert_src1_to_q8_1) {
char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset; char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;
@ -12967,14 +12830,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
src1_ncols * ne10 * sizeof(float)))); src1_ncols * ne10 * sizeof(float))));
} }
} }
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
SYCL_CHECK(ggml_sycl_cpy_tensor_2d( SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
} }
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
/* /*
DPCT1010:92: SYCL uses exceptions to report errors and does DPCT1010:92: SYCL uses exceptions to report errors and does
@ -13004,10 +12867,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
if (!dst_on_device) { if (!dst_on_device) {
void * dst_off_device; void * dst_off_device;
dpct::memcpy_direction kind; dpct::memcpy_direction kind;
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
dst_off_device = dst->data; dst_off_device = dst->data;
kind = dpct::device_to_host; kind = dpct::device_to_host;
} else if (dst->backend == GGML_BACKEND_GPU) { } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
dst_off_device = dst_extra->data_device[g_main_device_index]; dst_off_device = dst_extra->data_device[g_main_device_index];
kind = dpct::device_to_device; kind = dpct::device_to_device;
} else { } else {
@ -13091,7 +12954,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
} }
} }
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
SYCL_CHECK(ggml_sycl_set_device(g_main_device)); SYCL_CHECK(ggml_sycl_set_device(g_main_device));
SYCL_CHECK(CHECK_TRY_ERROR( SYCL_CHECK(CHECK_TRY_ERROR(
dpct::get_current_device().queues_wait_and_throw())); dpct::get_current_device().queues_wait_and_throw()));
@ -13228,7 +13091,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
const ggml_tensor *src1, const ggml_tensor *src1,
ggml_tensor *dst) try { ggml_tensor *dst) try {
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
@ -13266,7 +13129,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_is_permuted(src0)); GGML_ASSERT(!ggml_is_permuted(src0));
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
@ -13333,27 +13196,17 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00); GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t nb01 = src0->nb[1]; GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
const int64_t ne10 = src1->ne[0]; GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const int64_t nb11 = src1->nb[1]; GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
const int64_t ne1 = ggml_nelements(src1); const int64_t ne1 = ggml_nelements(src1);
const int64_t ne = ggml_nelements(dst); const int64_t ne = ggml_nelements(dst);
@ -13519,11 +13372,11 @@ catch (sycl::exception const &exc) {
static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool all_on_device = const bool all_on_device =
(src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
(src1->backend == GGML_BACKEND_GPU) && (src1->backend == GGML_BACKEND_TYPE_GPU) &&
( dst->backend == GGML_BACKEND_GPU); ( dst->backend == GGML_BACKEND_TYPE_GPU);
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
int64_t min_compute_capability = INT_MAX; int64_t min_compute_capability = INT_MAX;
for (int64_t id = 0; id < g_device_count; ++id) { for (int64_t id = 0; id < g_device_count; ++id) {
@ -13652,26 +13505,18 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
GGML_ASSERT(!ggml_is_transposed(src00)); GGML_ASSERT(!ggml_is_transposed(src00));
GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00); GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
const int64_t ne01 = src00->ne[1];
const int64_t ne02 = src00->ne[2];
const int64_t ne03 = src00->ne[3];
//const int64_t nb01 = src00->nb[1]; //const int64_t nb01 = src00->nb[1];
const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02); GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb);
const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
const int64_t ne10 = src1->ne[0]; GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
//const int64_t nb11 = src1->nb[1]; //const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
const int64_t ne1 = ggml_nelements(src1); const int64_t ne1 = ggml_nelements(src1);
const int64_t ne = ggml_nelements(dst); const int64_t ne = ggml_nelements(dst);
@ -13798,7 +13643,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
if (ids->backend == GGML_BACKEND_GPU) { if (ids->backend == GGML_BACKEND_TYPE_GPU) {
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index]; const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
SYCL_CHECK(CHECK_TRY_ERROR( SYCL_CHECK(CHECK_TRY_ERROR(
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)))); stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
@ -13816,20 +13661,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
ggml_tensor src1_row = *src1; ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst; ggml_tensor dst_row = *dst;
src1_row.backend = GGML_BACKEND_GPU; src1_row.backend = GGML_BACKEND_TYPE_GPU;
dst_row.backend = GGML_BACKEND_GPU; dst_row.backend = GGML_BACKEND_TYPE_GPU;
src1_row.extra = &src1_row_extra; src1_row.extra = &src1_row_extra;
dst_row.extra = &dst_row_extra; dst_row.extra = &dst_row_extra;
char * src1_original = src1->backend == GGML_BACKEND_CPU ? char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
(char *) src1->data : (char *) src1_extra->data_device[g_main_device_index]; (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
char * dst_original = dst->backend == GGML_BACKEND_CPU ? char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
(char *) dst->data : (char *) dst_extra->data_device[g_main_device_index]; (char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];
if (src1->ne[1] == 1) { if (src1->ne[1] == 1) {
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
//int32_t row_id; //int32_t row_id;
@ -13911,7 +13756,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
} }
} }
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
SYCL_CHECK(CHECK_TRY_ERROR(stream->wait())); SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
} }
} }
@ -13934,31 +13779,13 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
const int64_t ne = ggml_nelements(src0); const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1)); GGML_ASSERT(ne == ggml_nelements(src1));
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
const int64_t ne00 = src0->ne[0]; GGML_TENSOR_BINARY_OP_LOCALS;
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t nb00 = src0->nb[0];
const int64_t nb01 = src0->nb[1];
const int64_t nb02 = src0->nb[2];
const int64_t nb03 = src0->nb[3];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t nb10 = src1->nb[0];
const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2];
const int64_t nb13 = src1->nb[3];
SYCL_CHECK(ggml_sycl_set_device(g_main_device)); SYCL_CHECK(ggml_sycl_set_device(g_main_device));
dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
@ -14060,17 +13887,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
memset(extra, 0, sizeof(*extra)); memset(extra, 0, sizeof(*extra));
for (int64_t id = 0; id < g_device_count; ++id) { for (int64_t id = 0; id < g_device_count; ++id) {
if (backend == GGML_BACKEND_GPU && id != g_main_device_index) { if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) {
continue; continue;
} }
ggml_sycl_set_device(get_device_id_by_index(id)); ggml_sycl_set_device(get_device_id_by_index(id));
const dpct::queue_ptr stream = g_syclStreams[id][0]; const dpct::queue_ptr stream = g_syclStreams[id][0];
int64_t row_low, row_high; int64_t row_low, row_high;
if (backend == GGML_BACKEND_GPU) { if (backend == GGML_BACKEND_TYPE_GPU) {
row_low = 0; row_low = 0;
row_high = nrows; row_high = nrows;
} else if (backend == GGML_BACKEND_GPU_SPLIT) { } else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
const int64_t rounding = get_row_rounding(tensor->type); const int64_t rounding = get_row_rounding(tensor->type);
row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
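
For `GGML_BACKEND_TYPE_GPU_SPLIT` tensors, each device owns a contiguous row range derived from the cumulative `g_tensor_split` fractions, with the boundary aligned to the type's row rounding. A host-only sketch of that bookkeeping — the alignment details are my approximation of the surrounding code, not an exact copy:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t nrows    = 1000;
    const int64_t rounding = 32;                  // stand-in for get_row_rounding(type)
    const float   split[]  = {0.0f, 0.6f, 1.0f};  // cumulative fractions, 2 devices

    for (int id = 0; id < 2; ++id) {
        int64_t row_low = id == 0 ? 0 : (int64_t)(nrows * split[id]);
        row_low -= row_low % rounding;            // align the lower boundary
        int64_t row_high = id == 1 ? nrows : (int64_t)(nrows * split[id + 1]);
        if (id != 1) row_high -= row_high % rounding;
        std::printf("device %d owns rows [%lld, %lld)\n",
                    id, (long long)row_low, (long long)row_high);
    }
    return 0;   // device 0: [0, 576), device 1: [576, 1000)
}
```
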
@ -14119,7 +13946,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
extra->data_device[id] = buf; extra->data_device[id] = buf;
if (backend == GGML_BACKEND_GPU_SPLIT) { if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
for (int64_t is = 0; is < MAX_STREAMS; ++is) { for (int64_t is = 0; is < MAX_STREAMS; ++is) {
SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] = SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
new sycl::event())); new sycl::event()));
@ -14136,7 +13963,7 @@ catch (sycl::exception const &exc) {
} }
void ggml_sycl_free_data(struct ggml_tensor *tensor) try { void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) {
return; return;
} }
@ -14189,15 +14016,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
return; return;
} }
tensor->backend = GGML_BACKEND_GPU; tensor->backend = GGML_BACKEND_TYPE_GPU;
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) {
const ggml_op src0_op = tensor->src[0]->op; const ggml_op src0_op = tensor->src[0]->op;
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
} }
} }
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) {
ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
} }
@ -14215,7 +14042,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
SYCL_CHECK(ggml_sycl_set_device(g_main_device)); SYCL_CHECK(ggml_sycl_set_device(g_main_device));
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index]; char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
size_t offset = 0; size_t offset = 0;
@ -14284,7 +14111,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,
const bool inplace = tensor->view_src != nullptr; const bool inplace = tensor->view_src != nullptr;
if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index]; char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
size_t view_offset = 0; size_t view_offset = 0;
@ -14305,7 +14132,7 @@ catch (sycl::exception const &exc) {
} }
void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try { void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(ggml_is_contiguous(tensor)); GGML_ASSERT(ggml_is_contiguous(tensor));
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@ -14392,9 +14219,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
if (!g_sycl_loaded) return false; if (!g_sycl_loaded) return false;
ggml_sycl_func_t func; ggml_sycl_func_t func;
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
return false; return false;
@ -14532,14 +14359,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
return false; return false;
} }
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) { if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
ggml_sycl_set_peer_access(tensor->src[1]->ne[1]); ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
} }
if (params->ith != 0) { if (params->ith != 0) {
return true; return true;
} }
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return true; return true;
} }
func(tensor->src[0], tensor->src[1], tensor); func(tensor->src[0], tensor->src[1], tensor);
@ -14690,7 +14517,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
extra->data_device[ctx->device] = tensor->data; extra->data_device[ctx->device] = tensor->data;
tensor->backend = GGML_BACKEND_GPU; tensor->backend = GGML_BACKEND_TYPE_GPU;
tensor->extra = extra; tensor->extra = extra;
if (ggml_is_quantized(tensor->type)) { if (ggml_is_quantized(tensor->type)) {
@ -14721,7 +14548,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *tensor, ggml_tensor *tensor,
const void *data, size_t offset, const void *data, size_t offset,
size_t size) try { size_t size) try {
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
@ -14746,7 +14573,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
const ggml_tensor *tensor, const ggml_tensor *tensor,
void *data, size_t offset, void *data, size_t offset,
size_t size) try { size_t size) try {
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
@ -14815,7 +14642,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu
static ggml_backend_buffer_t static ggml_backend_buffer_t
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
size_t size) try { size_t size) try {
int device = (int) (intptr_t) buft->context; ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
int device = (int) buft_ctx->device;
ggml_sycl_set_device(device); ggml_sycl_set_device(device);
int device_index = get_device_index_by_id(device); int device_index = get_device_index_by_id(device);
@ -14893,7 +14721,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) { for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
ggml_backend_sycl_buffer_types[i] = { ggml_backend_sycl_buffer_types[i] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface, /* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
}; };
} }
ggml_backend_sycl_buffer_type_initialized = true; ggml_backend_sycl_buffer_type_initialized = true;
@ -14955,10 +14783,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
// backend // backend
struct ggml_backend_context_sycl {
int device;
};
static const char * ggml_backend_sycl_name(ggml_backend_t backend) { static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
return GGML_SYCL_NAME; return GGML_SYCL_NAME;
@ -14966,14 +14790,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
} }
static void ggml_backend_sycl_free(ggml_backend_t backend) { static void ggml_backend_sycl_free(ggml_backend_t backend) {
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
delete sycl_ctx; delete sycl_ctx;
delete backend; delete backend;
} }
static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) { static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
return ggml_backend_sycl_buffer_type(sycl_ctx->device); return ggml_backend_sycl_buffer_type(sycl_ctx->device);
} }
@ -14982,10 +14806,10 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
ggml_tensor *tensor, ggml_tensor *tensor,
const void *data, size_t offset, const void *data, size_t offset,
size_t size) try { size_t size) try {
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy( SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
(char *)tensor->data + offset, data, size))); (char *)tensor->data + offset, data, size)));
@ -15000,10 +14824,10 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
const ggml_tensor *tensor, const ggml_tensor *tensor,
void *data, size_t offset, void *data, size_t offset,
size_t size) try { size_t size) try {
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy( SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
data, (const char *)tensor->data + offset, size))); data, (const char *)tensor->data + offset, size)));
@ -15015,7 +14839,7 @@ catch (sycl::exception const &exc) {
} }
static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try { static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait())); SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
@ -15051,12 +14875,12 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
} }
static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
ggml_sycl_set_main_device(sycl_ctx->device); ggml_sycl_set_main_device(sycl_ctx->device);
ggml_compute_params params = {}; ggml_compute_params params = {};
params.type = GGML_TASK_COMPUTE; params.type = GGML_TASK_TYPE_COMPUTE;
params.ith = 0; params.ith = 0;
for (int i = 0; i < cgraph->n_nodes; i++) { for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i]; ggml_tensor * node = cgraph->nodes[i];
@ -15064,13 +14888,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
continue; continue;
assert(node->backend == GGML_BACKEND_GPU); assert(node->backend == GGML_BACKEND_TYPE_GPU);
assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device)); assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
assert(node->extra != nullptr); assert(node->extra != nullptr);
for (int j = 0; j < GGML_MAX_SRC; j++) { for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) { if (node->src[j] != nullptr) {
assert(node->src[j]->backend == GGML_BACKEND_GPU); assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU);
assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device)); assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
assert(node->src[j]->extra != nullptr); assert(node->src[j]->extra != nullptr);
} }
@ -15140,6 +14964,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
return false; return false;
} }
if (a->type == GGML_TYPE_IQ1_S) {
return false;
}
if (a->type == GGML_TYPE_IQ3_XXS) {
return false;
}
if (a->type == GGML_TYPE_IQ2_XXS) { if (a->type == GGML_TYPE_IQ2_XXS) {
return false; return false;
} }
@ -15259,8 +15089,9 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
// not strictly necessary, but it may reduce the overhead of the first graph_compute // not strictly necessary, but it may reduce the overhead of the first graph_compute
ggml_sycl_set_main_device(device); ggml_sycl_set_main_device(device);
ggml_backend_context_sycl * ctx = new ggml_backend_context_sycl { ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
/* .device = */ device /* .device = */ device,
/* .name = */ GGML_SYCL_NAME + std::to_string(device),
}; };
ggml_backend_t sycl_backend = new ggml_backend { ggml_backend_t sycl_backend = new ggml_backend {
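Most of the SYCL changes above are mechanical enum renames (GGML_BACKEND_GPU becomes GGML_BACKEND_TYPE_GPU, GGML_TASK_INIT becomes GGML_TASK_TYPE_INIT, and so on), but one structural change is worth noting: the buffer-type and backend contexts stop smuggling the device index through the void* context pointer as an intptr_t cast and instead point at a heap-allocated context struct that also carries a printable name. A minimal standalone sketch of that pattern follows; the struct and function names here are illustrative only, not the ggml API.

#include <cstdio>
#include <string>

// Hypothetical stand-ins for the backend context types used above.
struct backend_context {
    int         device;
    std::string name;
};

struct backend {
    void * context;  // opaque pointer owned by the backend
};

static backend make_backend(int device) {
    // One small allocation per backend instead of an intptr_t cast;
    // must be paired with release_backend() to avoid a leak.
    return backend{ new backend_context{ device, "SYCL" + std::to_string(device) } };
}

static void release_backend(backend & b) {
    delete static_cast<backend_context *>(b.context);
    b.context = nullptr;
}

int main() {
    backend b = make_backend(0);
    const auto * ctx = static_cast<const backend_context *>(b.context);
    std::printf("%s -> device %d\n", ctx->name.c_str(), ctx->device);
    release_backend(b);
    return 0;
}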


@@ -27,6 +27,7 @@
 #define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
 #define VK_VENDOR_ID_AMD 0x1002
+#define VK_VENDOR_ID_APPLE 0x106b
 #define VK_VENDOR_ID_INTEL 0x8086
 #define VK_VENDOR_ID_NVIDIA 0x10de
@@ -773,9 +774,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
 q.cmd_buffer_idx = 0;
 }
-static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
+for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
+vk::MemoryType memory_type = mem_props->memoryTypes[i];
+if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
+(flags & memory_type.propertyFlags) == flags &&
+mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
+return static_cast<int32_t>(i);
+}
+}
+return UINT32_MAX;
+}
+static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 #ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl;
+std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
 #endif
 vk_buffer buf = std::make_shared<vk_buffer_struct>();
@@ -802,15 +815,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 uint32_t memory_type_index = UINT32_MAX;
-for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
-vk::MemoryType memory_type = mem_props.memoryTypes[i];
-if ((mem_req.memoryTypeBits & ((uint64_t)1 << i)) && (req_flags & memory_type.propertyFlags) == req_flags && mem_props.memoryHeaps[memory_type.heapIndex].size >= mem_req.size) {
-memory_type_index = i;
-break;
-}
+memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
+buf->memory_property_flags = req_flags;
+if (memory_type_index == UINT32_MAX && fallback_flags) {
+memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
+buf->memory_property_flags = fallback_flags;
 }
-if (memory_type_index >= mem_props.memoryTypeCount) {
+if (memory_type_index == UINT32_MAX) {
 ctx->device->device.destroyBuffer(buf->buffer);
 buf->size = 0;
 throw vk::OutOfDeviceMemoryError("No suitable memory type found");
@@ -824,10 +837,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 buf->size = 0;
 throw e;
 }
-buf->memory_property_flags = req_flags;
 buf->ptr = nullptr;
-if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
 buf->ptr = ctx->device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
 }
@@ -844,9 +856,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 return buf;
 }
-static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 try {
-return ggml_vk_create_buffer(ctx, size, req_flags);
+return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
 } catch (const vk::SystemError& e) {
 std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
 std::cerr << "ggml_vulkan: " << e.what() << std::endl;
@@ -857,16 +869,16 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
 static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
 vk_buffer buf;
 try {
-buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
-} catch (const vk::SystemError& e) {
 if (ctx->device->uma) {
 // Fall back to host memory type
-buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
 } else {
-std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
-std::cerr << "ggml_vulkan: " << e.what() << std::endl;
-throw e;
+buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
 }
+} catch (const vk::SystemError& e) {
+std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
+std::cerr << "ggml_vulkan: " << e.what() << std::endl;
+throw e;
 }
 return buf;
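The allocation hunks above turn buffer creation into a two-pass search: try the required memory property flags first, and only if no memory type satisfies them retry with weaker fallback flags. This is what lets UMA devices fall back from device-local to host-visible memory, and pinned host buffers degrade from cached to merely coherent memory, without duplicating the selection loop. A standalone model of that search, with plain structs and flag bits standing in for the Vulkan-Hpp types (values chosen for illustration only):

#include <cstdint>
#include <cstdio>
#include <vector>

struct memory_type {
    uint32_t property_flags;  // bitmask of memory properties
    uint64_t heap_size;       // size of the heap backing this type
};

// Mirror of the find_properties() logic: the type must be allowed by the
// allocation's memory_type_bits, expose all requested flags, and sit on a
// heap large enough for the request.
static uint32_t find_properties(const std::vector<memory_type> & types,
                                uint32_t memory_type_bits, uint64_t size,
                                uint32_t flags) {
    for (uint32_t i = 0; i < types.size(); ++i) {
        if ((memory_type_bits & (1u << i)) &&
            (types[i].property_flags & flags) == flags &&
            types[i].heap_size >= size) {
            return i;
        }
    }
    return UINT32_MAX;
}

int main() {
    // Type 0: device-local (0x1); type 1: host-visible|host-coherent (0x6).
    const std::vector<memory_type> types = { { 0x1, 1ull << 30 }, { 0x6, 1ull << 28 } };

    const uint32_t required = 0x6 | 0x8;  // host-visible|coherent|cached
    const uint32_t fallback = 0x6;        // drop the cached requirement

    uint32_t idx = find_properties(types, 0x3, 4096, required);
    if (idx == UINT32_MAX) {
        idx = find_properties(types, 0x3, 4096, fallback);  // second pass
    }
    std::printf("selected memory type %u\n", idx);  // prints 1
    return 0;
}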
@@ -1155,6 +1167,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
 }
 }
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
 void ggml_vk_instance_init() {
 if (vk_instance_initialized) {
 return;
@@ -1164,28 +1179,42 @@ void ggml_vk_instance_init() {
 #endif
 vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
-const std::vector<const char*> layers = {
-#ifdef GGML_VULKAN_VALIDATE
-"VK_LAYER_KHRONOS_validation",
-#endif
-};
-const std::vector<const char*> extensions = {
-#ifdef GGML_VULKAN_VALIDATE
-"VK_EXT_validation_features",
-#endif
-};
-vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
-#ifdef GGML_VULKAN_VALIDATE
-const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
-vk::ValidationFeaturesEXT validation_features = {
-features_enable,
-{},
-};
-validation_features.setPNext(nullptr);
-instance_create_info.setPNext(&validation_features);
-std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
-#endif
+const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
+const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
+const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
+std::vector<const char*> layers;
+if (validation_ext) {
+layers.push_back("VK_LAYER_KHRONOS_validation");
+}
+std::vector<const char*> extensions;
+if (validation_ext) {
+extensions.push_back("VK_EXT_validation_features");
+}
+if (portability_enumeration_ext) {
+extensions.push_back("VK_KHR_portability_enumeration");
+}
+vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
+if (portability_enumeration_ext) {
+instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
+}
+std::vector<vk::ValidationFeatureEnableEXT> features_enable;
+vk::ValidationFeaturesEXT validation_features;
+if (validation_ext) {
+features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
+validation_features = {
+features_enable,
+{},
+};
+validation_features.setPNext(nullptr);
+instance_create_info.setPNext(&validation_features);
+std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
+}
 vk_instance.instance = vk::createInstance(instance_create_info);
 memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
@@ -1214,7 +1243,7 @@ void ggml_vk_instance_init() {
 vk_instance_initialized = true;
 }
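The instance-init hunk replaces the compile-time GGML_VULKAN_VALIDATE blocks with a runtime probe: enumerate the instance extensions once, then enable the validation layer and the portability-enumeration extension only when they are actually present (the latter is what lets portability drivers such as MoltenVK show up during device enumeration). A sketch of that probe under hypothetical names; the real code does the lookup through vk::enumerateInstanceExtensionProperties():

#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Linear scan over the reported extension names, as the two
// ggml_vk_instance_*_ext_available() helpers declared above would do.
static bool extension_available(const std::vector<std::string> & exts, const char * name) {
    for (const auto & e : exts) {
        if (std::strcmp(e.c_str(), name) == 0) {
            return true;
        }
    }
    return false;
}

int main() {
    // Pretend this list came back from the driver.
    const std::vector<std::string> exts = { "VK_KHR_surface", "VK_EXT_validation_features" };

    std::vector<const char *> layers;
    std::vector<const char *> extensions;
    if (extension_available(exts, "VK_EXT_validation_features")) {
        layers.push_back("VK_LAYER_KHRONOS_validation");
        extensions.push_back("VK_EXT_validation_features");
    }
    if (extension_available(exts, "VK_KHR_portability_enumeration")) {
        extensions.push_back("VK_KHR_portability_enumeration");
    }

    for (const char * e : extensions) {
        std::cout << "enabling " << e << "\n";
    }
    std::cout << layers.size() << " validation layer(s) requested\n";
    return 0;
}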
-void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
+static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
 GGML_ASSERT(idx < vk_instance.device_indices.size());
 size_t dev_num = vk_instance.device_indices[idx];
 #ifdef GGML_VULKAN_DEBUG
@@ -1232,12 +1261,12 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
 ctx->device = ggml_vk_get_device(idx);
 if (!ctx->device->initialized) {
 ctx->device->physical_device = devices[dev_num];
-std::vector<vk::ExtensionProperties> ext_props = ctx->device->physical_device.enumerateDeviceExtensionProperties();
+const std::vector<vk::ExtensionProperties> ext_props = ctx->device->physical_device.enumerateDeviceExtensionProperties();
 bool maintenance4_support = false;
 // Check if maintenance4 is supported
-for (auto properties : ext_props) {
+for (const auto& properties : ext_props) {
 if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
 maintenance4_support = true;
 }
@@ -1268,6 +1297,26 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
 bool fp16_storage = false;
 bool fp16_compute = false;
+for (const auto& properties : ext_props) {
+if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
+fp16_storage = true;
+} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
+fp16_compute = true;
+}
+}
+ctx->device->physical_device.getProperties2(&props2);
+ctx->device->properties = props2.properties;
+if (maintenance4_support) {
+ctx->device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize);
+} else {
+ctx->device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
+}
+ctx->device->vendor_id = ctx->device->properties.vendorID;
+ctx->device->subgroup_size = subgroup_props.subgroupSize;
+ctx->device->uma = ctx->device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
 for (auto properties : ext_props) {
 if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
 fp16_storage = true;
@@ -1505,7 +1554,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
 #endif
-vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+vk_buffer buf = ggml_vk_create_buffer(ctx, size,
+vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
 if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
 fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
@@ -1651,7 +1702,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
 if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
 ggml_vk_destroy_buffer(ctx->sync_staging);
-ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
+vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
 }
 }
@@ -1988,8 +2041,6 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
 // Copy within the device
 ggml_backend_vk_context * ctx = src->ctx;
-VkBufferCopy bc{ src_offset, dst_offset, size };
 vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
 ggml_vk_ctx_begin(ctx, subctx);
 ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
@@ -2120,18 +2171,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
 return ctx->device->pipeline_matmul_f32_aligned_l->align;
 }
-static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
-#endif
+static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
 if (bit16_x && bit16_y) {
-if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << " S" << std::endl;
 #endif
 return aligned ? ctx->device->pipeline_matmul_f16_aligned_s : ctx->device->pipeline_matmul_f16_s;
 }
-if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " M" << std::endl;
+#endif
+return aligned ? ctx->device->pipeline_matmul_f16_aligned_m : ctx->device->pipeline_matmul_f16_m;
+}
+if (bit16_x && !bit16_y) {
+if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " S" << std::endl;
+#endif
+return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_s : ctx->device->pipeline_matmul_f16_f32_s;
+}
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " M" << std::endl;
+#endif
+return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_m : ctx->device->pipeline_matmul_f16_f32_m;
+}
+if (!bit16_x && bit16_y) {
+GGML_ASSERT(false);
+}
+if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " S" << std::endl;
+#endif
+return aligned ? ctx->device->pipeline_matmul_f32_aligned_s : ctx->device->pipeline_matmul_f32_s;
+}
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " M" << std::endl;
+#endif
+return aligned ? ctx->device->pipeline_matmul_f32_aligned_m : ctx->device->pipeline_matmul_f32_m;
+}
+static vk_pipeline ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " M" << std::endl;
+#endif
+if (bit16_x && bit16_y) {
+return aligned ? ctx->device->pipeline_matmul_f16_aligned_m : ctx->device->pipeline_matmul_f16_m;
+}
+if (bit16_x && !bit16_y) {
+return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_m : ctx->device->pipeline_matmul_f16_f32_m;
+}
+if (!bit16_x && bit16_y) {
+GGML_ASSERT(false);
+}
+return aligned ? ctx->device->pipeline_matmul_f32_aligned_m : ctx->device->pipeline_matmul_f32_m;
+}
+static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " S" << std::endl;
+#endif
+if (bit16_x && bit16_y) {
+return aligned ? ctx->device->pipeline_matmul_f16_aligned_s : ctx->device->pipeline_matmul_f16_s;
+}
+if (bit16_x && !bit16_y) {
+return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_s : ctx->device->pipeline_matmul_f16_f32_s;
+}
+if (!bit16_x && bit16_y) {
+GGML_ASSERT(false);
+}
+return aligned ? ctx->device->pipeline_matmul_f32_aligned_s : ctx->device->pipeline_matmul_f32_s;
+}
+static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
+#endif
+switch (ctx->device->vendor_id) {
+case VK_VENDOR_ID_AMD:
+return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned);
+case VK_VENDOR_ID_APPLE:
+return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned);
+case VK_VENDOR_ID_INTEL:
+return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned);
+}
+if (bit16_x && bit16_y) {
+if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " S" << std::endl;
+#endif
+return aligned ? ctx->device->pipeline_matmul_f16_aligned_s : ctx->device->pipeline_matmul_f16_s;
+}
+if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << " M" << std::endl;
 #endif
@@ -2143,13 +2276,13 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
 return aligned ? ctx->device->pipeline_matmul_f16_aligned_l : ctx->device->pipeline_matmul_f16_l;
 }
 if (bit16_x && !bit16_y) {
-if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << " S" << std::endl;
 #endif
 return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_s : ctx->device->pipeline_matmul_f16_f32_s;
 }
-if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) {
+if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << " M" << std::endl;
 #endif
@@ -2164,13 +2297,13 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
 GGML_ASSERT(false);
 }
-if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << " S" << std::endl;
 #endif
 return aligned ? ctx->device->pipeline_matmul_f32_aligned_s : ctx->device->pipeline_matmul_f32_s;
 }
-if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) {
+if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << " M" << std::endl;
 #endif
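The rewrite above replaces per-branch vendor checks (the old VK_VENDOR_ID_INTEL and subgroup_size == 64 conditions scattered through one function) with a vendor dispatch: AMD keeps the size-dependent small/medium choice, Apple is pinned to the medium tile, Intel to the small one, and everything else falls through to the generic small/medium/large logic. A compressed sketch of that shape, reducing the pipeline choice to a tile-size enum and ignoring the f16/f32 operand variants:

#include <cstdint>
#include <cstdio>

enum class tile { S, M, L };

static tile pick_amd(int m, int n)   { return (m <= 32 || n <= 32) ? tile::S : tile::M; }
static tile pick_apple()             { return tile::M; }   // always medium
static tile pick_intel()             { return tile::S; }   // always small

static tile pick_default(int m, int n) {
    if (m <= 32 || n <= 32) return tile::S;
    if (m <= 64 || n <= 64) return tile::M;
    return tile::L;
}

static tile pick(uint32_t vendor_id, int m, int n) {
    switch (vendor_id) {
        case 0x1002: return pick_amd(m, n);    // VK_VENDOR_ID_AMD
        case 0x106b: return pick_apple();      // VK_VENDOR_ID_APPLE
        case 0x8086: return pick_intel();      // VK_VENDOR_ID_INTEL
        default:     return pick_default(m, n);
    }
}

int main() {
    // AMD small matrix, Apple anything, NVIDIA large matrix.
    std::printf("%d %d %d\n", (int)pick(0x1002, 16, 512),
                              (int)pick(0x106b, 512, 512),
                              (int)pick(0x10de, 512, 512));
    return 0;
}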
@ -2225,13 +2358,12 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
GGML_ASSERT(false); GGML_ASSERT(false);
} }
static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type) { static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
#ifdef GGML_VULKAN_DEBUG #ifdef GGML_VULKAN_DEBUG
std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl; std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
#endif #endif
const int tensor_type_size = ggml_type_size(tensor->type); const int tensor_type_size = ggml_type_size(tensor->type);
const int dst_type_size = ggml_type_size(buffer_type);
const uint32_t ne = ggml_nelements(tensor); const uint32_t ne = ggml_nelements(tensor);
@ -2290,8 +2422,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
src1_uma = d_Qy != nullptr; src1_uma = d_Qy != nullptr;
} }
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma; const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@ -2389,7 +2521,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
} }
if (x_non_contig) { if (x_non_contig) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, dst->type); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
} else if (load_x || qx_needs_dequant) { } else if (load_x || qx_needs_dequant) {
if (load_x) { if (load_x) {
// copy data to device // copy data to device
@ -2404,7 +2536,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
} }
} }
if (y_non_contig) { if (y_non_contig) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, dst->type); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
} else if (load_y) { } else if (load_y) {
ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
} }
@ -2423,7 +2555,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
// compute // compute
ggml_vk_matmul(ctx, subctx, pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT ggml_vk_matmul(ctx, subctx, pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
// copy dst to host // copy dst to host
float * d = (float *) ((char *) dst->data); float * d = (float *) ((char *) dst->data);
ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13); ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
@ -2476,8 +2608,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
src1_uma = d_Qy != nullptr; src1_uma = d_Qy != nullptr;
} }
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma; const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@ -2559,14 +2691,14 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
if (x_non_contig) { if (x_non_contig) {
GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, src0->type); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
} else if (load_x) { } else if (load_x) {
// copy data to device // copy data to device
ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0)); ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
} }
if (y_non_contig) { if (y_non_contig) {
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, src1->type); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
} else if (load_y) { } else if (load_y) {
ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
} }
@ -2600,7 +2732,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1}); ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
// copy dst to host // copy dst to host
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
@ -2617,7 +2749,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
#endif #endif
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
@ -2649,7 +2781,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
src1_uma = d_Qy != nullptr; src1_uma = d_Qy != nullptr;
} }
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
const uint64_t x_ne = ne00 * ne01 * ne02; const uint64_t x_ne = ne00 * ne01 * ne02;
const uint64_t y_ne = ne10 * ne11 * ne12; const uint64_t y_ne = ne10 * ne11 * ne12;
@ -2691,7 +2823,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
// copy dst to host // copy dst to host
float * d = (float *) dst->data; float * d = (float *) dst->data;
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
@ -2708,7 +2840,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_is_permuted(src0)); GGML_ASSERT(!ggml_is_permuted(src0));
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
@ -2741,7 +2873,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
src1_uma = d_Qy != nullptr; src1_uma = d_Qy != nullptr;
} }
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
const uint64_t d_ne = ne01 * ne11 * ne12; const uint64_t d_ne = ne01 * ne11 * ne12;
@ -2784,7 +2916,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
// copy dst to host // copy dst to host
float * d = (float *) dst->data; float * d = (float *) dst->data;
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
@ -2802,7 +2934,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
dst->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU); ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
} }
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
@ -2820,9 +2952,9 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
} }
} }
static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { // static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
//
} // }
static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
// guaranteed to be an integer due to the check in ggml_can_repeat // guaranteed to be an integer due to the check in ggml_can_repeat
@ -2854,8 +2986,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
// TODO: support for transposed / permuted tensors // TODO: support for transposed / permuted tensors
GGML_ASSERT(nb0 == sizeof(float)); GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@ -3088,8 +3220,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
} }
} }
const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma; const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma; const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment); uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0; uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
@ -3098,7 +3230,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
// Workaround for tiny tensor inputs on ROPE // Workaround for tiny tensor inputs on ROPE
if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) { if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
y_sz = VK_WHOLE_SIZE; y_sz = VK_WHOLE_SIZE;
} }
@ -3187,9 +3319,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} }
if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) { if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst); ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
} else if(dst->backend == GGML_BACKEND_CPU) { } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
// copy dst to host // copy dst to host
float * d = (float *) dst->data; float * d = (float *) dst->data;
ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz); ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
@ -3232,7 +3364,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
} }
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
// copy dst to host // copy dst to host
ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz); ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
} }
@ -3259,7 +3391,8 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0, 0, 0,
0.0f, 0.0f,
}); });
} }
@ -3273,7 +3406,8 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0, 0, 0,
0.0f, 0.0f,
}); });
} }
@ -3387,12 +3521,12 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
int32_t * op_params = (int32_t *)dst->op_params;
-ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ASC });
+ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
}
static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
// If backend is CPU, data from src0 has to be copied off the device
-if (dst->backend == GGML_BACKEND_CPU) {
+if (dst->backend == GGML_BACKEND_TYPE_CPU) {
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
vk_buffer d_D = extra_src0->buffer_gpu.lock();
ggml_vk_sync_buffers(subctx);
@ -4198,9 +4332,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
#ifdef GGML_VULKAN_DEBUG
std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
#endif
-const bool any_on_device = node->backend == GGML_BACKEND_GPU
-    || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-    || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU));
+const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
+    || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+    || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node))) {
return;
@ -4320,7 +4454,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl; std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
#endif #endif
#if defined(GGML_VULKAN_RUN_TESTS) #if defined(GGML_VULKAN_RUN_TESTS)
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
ggml_vk_test_transfer(ctx, 8192 * 1000, false); ggml_vk_test_transfer(ctx, 8192 * 1000, false);
ggml_vk_test_transfer(ctx, 8192 * 1000, true); ggml_vk_test_transfer(ctx, 8192 * 1000, true);
@ -4417,14 +4553,16 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
if (ctx->staging != nullptr) {
ggml_vk_destroy_buffer(ctx->staging);
}
-ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
+    vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+    vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
}
}
static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-const bool any_on_device = node->backend == GGML_BACKEND_GPU
-    || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-    || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU);
+const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
+    || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+    || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node)) || (ggml_vk_cpu_assist_op(node) && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
return;
@ -4571,7 +4709,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
break;
case GGML_OP_MUL_MAT_ID:
-ggml_vk_mul_mat_id(ctx, ctx->compute_ctx, src0, src1, node);
+//ggml_vk_mul_mat_id(ctx, ctx->compute_ctx, src0, src1, node);
+std::cerr << "ggml_vulkan: GGML_OP_MUL_MAT_ID not implemented yet." << std::endl;
+GGML_ASSERT(false);
break;
default:
@ -4587,7 +4727,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
last_node = true;
#endif
-if (node->backend == GGML_BACKEND_CPU || last_node) {
+if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
ggml_vk_ctx_end(ctx->compute_ctx);
ctx->compute_ctx->exit_tensor = node;
ctx->compute_ctx = nullptr;
@ -4595,9 +4735,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
}
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-    || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-    || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
+    || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+    || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(tensor))) {
return false;
@ -4660,7 +4800,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
if (params->ith != 0) {
return true;
}
-if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return true;
}
@ -4786,13 +4926,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
ctx->device->device.destroyFence(ctx->fence);
}
-GGML_CALL int ggml_vk_get_device_count() {
+GGML_CALL static int ggml_vk_get_device_count() {
ggml_vk_instance_init();
return vk_instance.device_indices.size();
}
-GGML_CALL void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
ggml_vk_instance_init();
std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@ -4810,7 +4950,7 @@ void ggml_vk_init_cpu_assist() {
std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl; std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
for (size_t i = 0; i < ggml_vk_get_device_count(); i++) { for (int i = 0; i < ggml_vk_get_device_count(); i++) {
ggml_vk_print_gpu_info(i); ggml_vk_print_gpu_info(i);
} }
// Initialize the first backend to make sure CPU matrix multiplications can be offloaded. // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
@ -4958,7 +5098,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
}
-tensor->backend = GGML_BACKEND_GPU;
+tensor->backend = GGML_BACKEND_TYPE_GPU;
tensor->extra = extra;
}
@ -4966,7 +5106,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
#ifdef GGML_VULKAN_DEBUG
std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
#endif
-GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
@ -4981,7 +5121,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
#ifdef GGML_VULKAN_DEBUG
std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
#endif
-GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
@ -4994,7 +5134,6 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
if (ggml_backend_buffer_is_vk(src->buffer)) {
-ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
@ -5211,7 +5350,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
#endif
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@ -5232,7 +5371,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
#endif
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@ -5309,7 +5448,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
int last_node = cgraph->n_nodes - 1;
// If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
-while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
+while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) {
last_node -= 1;
}
@ -5318,7 +5457,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
}
ggml_compute_params params = {};
-params.type = GGML_TASK_COMPUTE;
+params.type = GGML_TASK_TYPE_COMPUTE;
params.ith = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@ -5498,7 +5637,7 @@ GGML_CALL void ggml_backend_vk_get_device_description(int device, char * descrip
}
GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
-GGML_ASSERT(device < vk_instance.device_indices.size());
+GGML_ASSERT(device < (int) vk_instance.device_indices.size());
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
@ -5532,6 +5671,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
return vk_instance.device_indices.size();
}
// Extension availability
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
#ifdef GGML_VULKAN_VALIDATE
// Check for the validation features extension
for (const auto& properties : instance_extensions) {
if (strcmp("VK_EXT_validation_features", properties.extensionName) == 0) {
return true;
}
}
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_EXT_validation_features not found." << std::endl;
#endif
return false;
UNUSED(instance_extensions);
}
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
#ifdef __APPLE__
// Check for the portability enumeration extension, needed for MoltenVK support
for (const auto& properties : instance_extensions) {
if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
return true;
}
}
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
#endif
return false;
UNUSED(instance_extensions);
}
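The two helpers above only report availability; a minimal sketch of how such checks are typically consumed when building the instance (Vulkan-Hpp names; this is a hedged illustration, not this file's actual instance-creation code):

std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();

std::vector<const char *> extensions;
vk::InstanceCreateFlags flags{};
if (ggml_vk_instance_portability_enumeration_ext_available(instance_extensions)) {
    // MoltenVK only enumerates non-conformant (portability) devices when asked to
    extensions.push_back("VK_KHR_portability_enumeration");
    flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
}

vk::ApplicationInfo app_info("ggml-vulkan", 1, nullptr, 0, VK_API_VERSION_1_2);
vk::InstanceCreateInfo instance_create_info(flags, &app_info, {}, extensions);
vk::Instance instance = vk::createInstance(instance_create_info);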
// checks
#ifdef GGML_VULKAN_CHECK_RESULTS
@ -5588,7 +5763,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
void * tensor_data = tensor->data;
-if (tensor->backend == GGML_BACKEND_GPU) {
+if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
const size_t tensor_size = ggml_nbytes(tensor);
tensor_data = malloc(tensor_size);
@ -5615,14 +5790,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
-if (tensor->backend == GGML_BACKEND_GPU) {
+if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
free(tensor_data);
}
}
static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
return;
-GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU);
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
return;
}
@ -5660,7 +5835,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
if (params->ith != 0) {
return;
}
-if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
+if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
return;
}
@ -5697,10 +5872,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
src0_buffer = malloc(src0_size);
src0_clone->data = src0_buffer;
-if (src0->backend == GGML_BACKEND_CPU) {
+if (src0->backend == GGML_BACKEND_TYPE_CPU) {
memcpy(src0_clone->data, src0->data, src0_size);
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
-} else if (src0->backend == GGML_BACKEND_GPU) {
+} else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
vk_buffer buf = extra->buffer_gpu.lock();
uint64_t offset = extra->offset;
@ -5741,10 +5916,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
src1_buffer = malloc(src1_size);
src1_clone->data = src1_buffer;
-if (src1->backend == GGML_BACKEND_CPU) {
+if (src1->backend == GGML_BACKEND_TYPE_CPU) {
memcpy(src1_clone->data, src1->data, src1_size);
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
-} else if (src1->backend == GGML_BACKEND_GPU) {
+} else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
vk_buffer buf = extra->buffer_gpu.lock();
uint64_t offset = extra->offset;
@ -5904,7 +6079,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
if (params->ith != 0) {
return;
}
-if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
+if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
return;
}
if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
@ -5916,7 +6091,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
void * tensor_data = tensor->data;
-if (tensor->backend == GGML_BACKEND_GPU) {
+if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
size_t tensor_size = ggml_nbytes(tensor);
tensor_data = malloc(tensor_size);
@ -6050,7 +6225,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
comp_result = nullptr;
comp_size = 0;
-if (tensor->backend == GGML_BACKEND_GPU) {
+if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
free(tensor_data);
}
}


@ -10,6 +10,7 @@ extern "C" {
#define GGML_VK_NAME "Vulkan" #define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16 #define GGML_VK_MAX_DEVICES 16
GGML_API void ggml_vk_instance_init(void);
GGML_API void ggml_vk_init_cpu_assist(void); GGML_API void ggml_vk_init_cpu_assist(void);
GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node); GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);

ggml.c (2027 changes): file diff suppressed because it is too large.

ggml.h (107 changes):

@ -315,13 +315,7 @@
extern "C" { extern "C" {
#endif #endif
#if defined(__ARM_NEON) && defined(__CUDACC__)
typedef half ggml_fp16_t;
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t; typedef uint16_t ggml_fp16_t;
#endif
// convert FP16 <-> FP32 // convert FP16 <-> FP32
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
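With the typedef now unconditional, conversions always go through this API. A minimal round-trip sketch (assumes the companion ggml_fp32_to_fp16() declared alongside it in this header):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    ggml_fp16_t h = ggml_fp32_to_fp16(3.14159f);
    float f = ggml_fp16_to_fp32(h);
    printf("%f\n", f); // ~3.140625: fp16 keeps only 11 significant bits
    return 0;
}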
@ -354,6 +348,9 @@ extern "C" {
GGML_TYPE_IQ2_XXS = 16,
GGML_TYPE_IQ2_XS = 17,
GGML_TYPE_IQ3_XXS = 18,
GGML_TYPE_IQ1_S = 19,
GGML_TYPE_IQ4_NL = 20,
GGML_TYPE_IQ3_S = 21,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
@ -367,9 +364,9 @@ extern "C" {
};
enum ggml_backend_type {
-GGML_BACKEND_CPU = 0,
-GGML_BACKEND_GPU = 10,
-GGML_BACKEND_GPU_SPLIT = 20,
+GGML_BACKEND_TYPE_CPU = 0,
+GGML_BACKEND_TYPE_GPU = 10,
+GGML_BACKEND_TYPE_GPU_SPLIT = 20,
};
// model file types
@ -391,6 +388,9 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
};
// available tensor operations:
@ -498,18 +498,24 @@ extern "C" {
};
enum ggml_object_type {
-GGML_OBJECT_TENSOR,
-GGML_OBJECT_GRAPH,
-GGML_OBJECT_WORK_BUFFER
+GGML_OBJECT_TYPE_TENSOR,
+GGML_OBJECT_TYPE_GRAPH,
+GGML_OBJECT_TYPE_WORK_BUFFER
};
enum ggml_log_level {
GGML_LOG_LEVEL_ERROR = 2,
GGML_LOG_LEVEL_WARN = 3,
GGML_LOG_LEVEL_INFO = 4,
GGML_LOG_LEVEL_DEBUG = 5
};
enum ggml_tensor_flag {
GGML_TENSOR_FLAG_INPUT = 1,
GGML_TENSOR_FLAG_OUTPUT = 2,
GGML_TENSOR_FLAG_PARAM = 4,
};
// ggml object
struct ggml_object {
size_t offs;
@ -543,7 +549,7 @@ extern "C" {
// op params - allocated as int32_t for alignment
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
-bool is_param;
+int32_t flags;
struct ggml_tensor * grad;
struct ggml_tensor * src[GGML_MAX_SRC];
@ -567,6 +573,11 @@ extern "C" {
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
// Abort callback
// If not NULL, called before ggml computation
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
@ -576,8 +587,8 @@ extern "C" {
int n_threads;
// abort ggml_graph_compute when true
-bool (*abort_callback)(void * data);
+ggml_abort_callback abort_callback;
void * abort_callback_data;
};
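A hedged sketch of the named typedef in use; ggml_graph_plan()/ggml_graph_compute() are the usual pairing from this header, and `graph` plus the stop flag are illustrative names:

static bool my_abort_cb(void * data) {
    return *(bool *) data; // true => abort the current graph computation
}

bool stop = false;
struct ggml_cplan plan = ggml_graph_plan(graph, /*n_threads=*/4);
if (plan.work_size > 0) {
    plan.work_data = (uint8_t *) malloc(plan.work_size);
}
plan.abort_callback      = my_abort_cb;
plan.abort_callback_data = &stop;
ggml_graph_compute(graph, &plan);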
enum ggml_cgraph_eval_order {
@ -631,9 +642,9 @@ extern "C" {
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
enum ggml_task_type {
-GGML_TASK_INIT = 0,
-GGML_TASK_COMPUTE,
-GGML_TASK_FINALIZE,
+GGML_TASK_TYPE_INIT = 0,
+GGML_TASK_TYPE_COMPUTE,
+GGML_TASK_TYPE_FINALIZE,
};
struct ggml_compute_params {
@ -647,6 +658,16 @@ extern "C" {
void * wdata;
};
// numa strategies
enum ggml_numa_strategy {
GGML_NUMA_STRATEGY_DISABLED = 0,
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
GGML_NUMA_STRATEGY_ISOLATE = 2,
GGML_NUMA_STRATEGY_NUMACTL = 3,
GGML_NUMA_STRATEGY_MIRROR = 4,
GGML_NUMA_STRATEGY_COUNT
};
// misc
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@ -657,7 +678,7 @@ extern "C" {
GGML_API void ggml_print_backtrace(void);
-GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
+GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_API void ggml_print_object (const struct ggml_object * obj);
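The init call now selects one of the strategies from the enum above instead of taking no argument; for example:

ggml_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE); // spread threads evenly across NUMA nodes
if (ggml_is_numa()) {
    // more than one NUMA node was detected
}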
@ -1362,13 +1383,17 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
-// fused soft_max(a*scale + mask)
+// fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
// mask is optional
+// pos is required when max_bias > 0.0f
+// max_bias = 0.0f for no ALiBi
GGML_API struct ggml_tensor * ggml_soft_max_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
-float scale);
+struct ggml_tensor * pos,
+float scale,
+float max_bias);
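A hedged sketch of the widened call; kq, kq_mask, kq_pos and n_embd_head are illustrative names, and passing pos = NULL with max_bias = 0.0f reproduces the old three-argument behavior:

struct ggml_tensor * kq_soft = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos,
        1.0f / sqrtf((float) n_embd_head), /*max_bias=*/8.0f);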
GGML_API struct ggml_tensor * ggml_soft_max_back(
struct ggml_context * ctx,
@ -1470,12 +1495,13 @@ extern "C" {
// alibi position embedding
// in-place, returns view(a)
-GGML_API struct ggml_tensor * ggml_alibi(
+GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_head,
-float bias_max);
+float bias_max),
+"use ggml_soft_max_ext instead (will be removed in Mar 2024)");
// clamp
// in-place, returns view(a)
@ -1623,8 +1649,8 @@ extern "C" {
// sort rows
enum ggml_sort_order {
-GGML_SORT_ASC,
-GGML_SORT_DESC,
+GGML_SORT_ORDER_ASC,
+GGML_SORT_ORDER_DESC,
};
GGML_API struct ggml_tensor * ggml_argsort(
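With the prefixed names, a call reads as follows (argument list assumed from the declaration this hunk truncates; `logits` is illustrative):

struct ggml_tensor * idx = ggml_argsort(ctx, logits, GGML_SORT_ORDER_DESC);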
@ -1917,8 +1943,8 @@ extern "C" {
// optimization methods
enum ggml_opt_type {
-GGML_OPT_ADAM,
-GGML_OPT_LBFGS,
+GGML_OPT_TYPE_ADAM,
+GGML_OPT_TYPE_LBFGS,
};
// linesearch methods
@ -1932,12 +1958,12 @@ extern "C" {
// optimization return values
enum ggml_opt_result {
-GGML_OPT_OK = 0,
-GGML_OPT_DID_NOT_CONVERGE,
-GGML_OPT_NO_CONTEXT,
-GGML_OPT_INVALID_WOLFE,
-GGML_OPT_FAIL,
-GGML_OPT_CANCEL,
+GGML_OPT_RESULT_OK = 0,
+GGML_OPT_RESULT_DID_NOT_CONVERGE,
+GGML_OPT_RESULT_NO_CONTEXT,
+GGML_OPT_RESULT_INVALID_WOLFE,
+GGML_OPT_RESULT_FAIL,
+GGML_OPT_RESULT_CANCEL,
GGML_LINESEARCH_FAIL = -128,
GGML_LINESEARCH_MINIMUM_STEP,
@ -2087,6 +2113,12 @@ extern "C" {
ggml_opt_callback callback,
void * callback_data);
//
// tensor flags
//
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
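A hedged sketch of the new setters together with the flags field introduced above; the tensor names are illustrative:

struct ggml_tensor * inp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
ggml_set_input(inp);   // records GGML_TENSOR_FLAG_INPUT in inp->flags

struct ggml_tensor * out = ggml_mul_mat(ctx, w, inp);
ggml_set_output(out);  // records GGML_TENSOR_FLAG_OUTPUT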
//
// quantization
//
@ -2273,6 +2305,7 @@ extern "C" {
GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void);
//
// Internal types and functions exposed for tests and benchmarks
@ -2286,7 +2319,8 @@ extern "C" {
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+    const void * GGML_RESTRICT y, size_t by, int nrc);
typedef struct {
const char * type_name;
@ -2298,6 +2332,7 @@ extern "C" {
ggml_from_float_t from_float_reference;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously;
} ggml_type_traits_t;
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
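A hedged sketch of a kernel matching the widened ggml_vec_dot_t: nrc asks for several result rows per call (support is advertised via the nrows field of the traits struct); the exact units of the bs/bx/by strides are an assumption here:

static void vec_dot_f32_rows(int n, float * s, size_t bs, const void * vx, size_t bx,
                             const void * vy, size_t by, int nrc) {
    for (int r = 0; r < nrc; ++r) {
        const float * x = (const float *) ((const char *) vx + r*bx); // bx bytes between x rows (assumed)
        const float * y = (const float *) ((const char *) vy + r*by);
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            sum += x[i]*y[i];
        }
        s[r*bs] = sum; // bs taken as an element stride for s (assumed)
    }
}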

reader.py (new file):

@ -0,0 +1,45 @@
#!/usr/bin/env python3
import sys
from pathlib import Path

# Make the in-tree gguf package importable before importing from it.
sys.path.insert(0, str(Path(__file__).parent.parent))

from gguf.gguf_reader import GGUFReader


def read_gguf_file(gguf_file_path):
    """
    Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.

    Parameters:
    - gguf_file_path: Path to the GGUF file.
    """
    reader = GGUFReader(gguf_file_path)

    # List all key-value pairs in a columnized format
    print("Key-Value Pairs:")
    max_key_length = max(len(key) for key in reader.fields.keys())
    for key, field in reader.fields.items():
        value = field.parts[field.data[0]]
        print(f"{key:{max_key_length}} : {value}")
    print("----")

    # List all tensors
    print("Tensors:")
    tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
    print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization"))
    print("-" * 80)
    for tensor in reader.tensors:
        shape_str = "x".join(map(str, tensor.shape))
        size_str = str(tensor.n_elements)
        quantization_str = tensor.tensor_type.name
        print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str))


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: reader.py <path_to_gguf_file>")
        sys.exit(1)
    gguf_file_path = sys.argv[1]
    read_gguf_file(gguf_file_path)


@ -40,6 +40,7 @@ class Keys:
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
EXPERT_COUNT = "{arch}.expert_count"
EXPERT_USED_COUNT = "{arch}.expert_used_count"
POOLING_TYPE = "{arch}.pooling_type"
class Attention:
HEAD_COUNT = "{arch}.attention.head_count"
@ -50,6 +51,7 @@ class Keys:
VALUE_LENGTH = "{arch}.attention.value_length"
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
CAUSAL = "{arch}.attention.causal"
class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
@ -60,22 +62,25 @@ class Keys:
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
class Tokenizer:
MODEL = "tokenizer.ggml.model"
LIST = "tokenizer.ggml.tokens"
TOKEN_TYPE = "tokenizer.ggml.token_type"
+TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
SCORES = "tokenizer.ggml.scores"
MERGES = "tokenizer.ggml.merges"
BOS_ID = "tokenizer.ggml.bos_token_id"
EOS_ID = "tokenizer.ggml.eos_token_id"
UNK_ID = "tokenizer.ggml.unknown_token_id"
SEP_ID = "tokenizer.ggml.seperator_token_id"
PAD_ID = "tokenizer.ggml.padding_token_id"
+CLS_ID = "tokenizer.ggml.cls_token_id"
+MASK_ID = "tokenizer.ggml.mask_token_id"
ADD_BOS = "tokenizer.ggml.add_bos_token"
ADD_EOS = "tokenizer.ggml.add_eos_token"
ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
HF_JSON = "tokenizer.huggingface.json"
RWKV = "tokenizer.rwkv.world"
CHAT_TEMPLATE = "tokenizer.chat_template"
#
@ -84,27 +89,29 @@ class Keys:
class MODEL_ARCH(IntEnum):
LLAMA = auto()
FALCON = auto()
BAICHUAN = auto()
GPT2 = auto()
GPTJ = auto()
GPTNEOX = auto()
MPT = auto()
STARCODER = auto()
PERSIMMON = auto()
REFACT = auto()
BERT = auto()
+NOMIC_BERT = auto()
BLOOM = auto()
STABLELM = auto()
QWEN = auto()
QWEN2 = auto()
PHI2 = auto()
PLAMO = auto()
CODESHELL = auto()
ORION = auto()
INTERNLM2 = auto()
MINICPM = auto()
+GEMMA = auto()
class MODEL_TENSOR(IntEnum):
@ -122,6 +129,7 @@ class MODEL_TENSOR(IntEnum):
ATTN_OUT = auto()
ATTN_NORM = auto()
ATTN_NORM_2 = auto()
ATTN_OUT_NORM = auto()
ATTN_ROT_EMBD = auto()
FFN_GATE_INP = auto()
FFN_NORM = auto()
@ -134,6 +142,7 @@ class MODEL_TENSOR(IntEnum):
FFN_UP_EXP = auto()
ATTN_Q_NORM = auto()
ATTN_K_NORM = auto()
LAYER_OUT_NORM = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@ -148,6 +157,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.PERSIMMON: "persimmon", MODEL_ARCH.PERSIMMON: "persimmon",
MODEL_ARCH.REFACT: "refact", MODEL_ARCH.REFACT: "refact",
MODEL_ARCH.BERT: "bert", MODEL_ARCH.BERT: "bert",
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.QWEN: "qwen",
@ -158,6 +168,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.ORION: "orion", MODEL_ARCH.ORION: "orion",
MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.INTERNLM2: "internlm2",
MODEL_ARCH.MINICPM: "minicpm", MODEL_ARCH.MINICPM: "minicpm",
MODEL_ARCH.GEMMA: "gemma",
} }
TENSOR_NAMES: dict[MODEL_TENSOR, str] = { TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -178,6 +189,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
@ -187,6 +199,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -262,17 +275,32 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
],
MODEL_ARCH.BERT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.TOKEN_TYPES,
MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
-MODEL_TENSOR.ATTN_NORM,
+MODEL_TENSOR.ATTN_OUT_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
-MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.LAYER_OUT_NORM,
],
MODEL_ARCH.NOMIC_BERT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.TOKEN_TYPES,
MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_OUT_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.LAYER_OUT_NORM,
],
MODEL_ARCH.MPT: [
MODEL_TENSOR.TOKEN_EMBD,
@ -485,6 +513,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.GEMMA: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_NORM,
],
# TODO
}
@ -535,6 +576,12 @@ class RopeScalingType(Enum):
YARN = 'yarn'
class PoolingType(IntEnum):
NONE = 0
MEAN = 1
CLS = 2
class GGMLQuantizationType(IntEnum):
F32 = 0
F16 = 1
@ -661,5 +708,7 @@ KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV

Some files were not shown because too many files have changed in this diff.