Merge branch 'gg/flash-attn' of https://github.com/ggerganov/llama.cpp into flash-attn-cuda

This commit is contained in:
FSSRepo 2024-01-29 13:17:39 -05:00
commit 7980178a17
94 changed files with 90340 additions and 2283 deletions

View file

@ -0,0 +1,26 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG UBUNTU_VERSION=22.04
FROM intel/hpckit:$ONEAPI_VERSION as build
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
# for some reason, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" gives worse performance
RUN mkdir build && \
cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
cmake --build . --config Release --target main server
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/build/bin/main /main
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]

View file

@ -7,6 +7,18 @@
{ system, ... }: { system, ... }:
{ {
_module.args = { _module.args = {
# Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
# again, the below creates several nixpkgs instances which the
# flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
#
# This is currently "slow" and "expensive", on a certain scale.
# This also isn't "right" in that this hinders dependency injection at
# the level of flake inputs. This might get removed in the foreseeable
# future.
#
# Note that you can use these expressions without Nix
# (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).
pkgsCuda = import inputs.nixpkgs { pkgsCuda = import inputs.nixpkgs {
inherit system; inherit system;
# Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc, # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,

View file

@ -73,6 +73,7 @@ let
ps: [ ps: [
ps.numpy ps.numpy
ps.sentencepiece ps.sentencepiece
ps.tiktoken
ps.torchWithoutCuda ps.torchWithoutCuda
ps.transformers ps.transformers
] ]
@ -114,14 +115,22 @@ effectiveStdenv.mkDerivation (
pname = "llama-cpp${pnameSuffix}"; pname = "llama-cpp${pnameSuffix}";
version = llamaVersion; version = llamaVersion;
# Note: none of the files discarded here are visible in the sandbox or
# affect the output hash. This also means they can be modified without
# triggering a rebuild.
src = lib.cleanSourceWith { src = lib.cleanSourceWith {
filter = filter =
name: type: name: type:
!(builtins.any (_: _) [ let
noneOf = builtins.all (x: !x);
baseName = baseNameOf name;
in
noneOf [
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
(name == "README.md") # Ignore *.md changes when computing outPaths (lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
(lib.hasPrefix "." name) # Skip hidden files and directories (lib.hasPrefix "." baseName) # Skip hidden files and directories
]); (baseName == "flake.lock")
];
src = lib.cleanSource ../../.; src = lib.cleanSource ../../.;
}; };
@ -159,7 +168,7 @@ effectiveStdenv.mkDerivation (
cmakeFlags = cmakeFlags =
[ [
(cmakeBool "LLAMA_NATIVE" true) (cmakeBool "LLAMA_NATIVE" false)
(cmakeBool "LLAMA_BUILD_SERVER" true) (cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" true) (cmakeBool "BUILD_SHARED_LIBS" true)
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
@ -216,6 +225,9 @@ effectiveStdenv.mkDerivation (
description = "contains numpy and sentencepiece"; description = "contains numpy and sentencepiece";
buildInputs = [ llama-python ]; buildInputs = [ llama-python ];
inputsFrom = [ finalAttrs.finalPackage ]; inputsFrom = [ finalAttrs.finalPackage ];
shellHook = ''
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
'';
}; };
shell-extra = mkShell { shell-extra = mkShell {

View file

@ -4,6 +4,10 @@
llamaVersion ? "0.0.0", llamaVersion ? "0.0.0",
}: }:
# We're using `makeScope` instead of just writing out an attrset
# because it allows users to apply overlays later using `overrideScope'`.
# Cf. https://noogle.dev/f/lib/makeScope
lib.makeScope newScope ( lib.makeScope newScope (
self: { self: {
inherit llamaVersion; inherit llamaVersion;

View file

@ -0,0 +1,32 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
RUN make
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
COPY --from=build /app/server /server
ENTRYPOINT [ "/server" ]

View file

@ -0,0 +1,25 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG UBUNTU_VERSION=22.04
FROM intel/hpckit:$ONEAPI_VERSION as build
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
# for some reason, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" gives worse performance
RUN mkdir build && \
cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
cmake --build . --config Release --target main server
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]

View file

@ -0,0 +1,45 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set ROCm GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT [ "/app/server" ]

20
.devops/server.Dockerfile Normal file
View file

@ -0,0 +1,20 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
RUN make
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]

View file

@ -72,7 +72,7 @@ jobs:
id: cmake_test id: cmake_test
run: | run: |
cd build cd build
ctest --verbose --timeout 900 ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-sanitizer: ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -107,7 +107,7 @@ jobs:
id: cmake_test id: cmake_test
run: | run: |
cd build cd build
ctest --verbose --timeout 900 ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-mpi: ubuntu-latest-cmake-mpi:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -141,7 +141,48 @@ jobs:
id: cmake_test id: cmake_test
run: | run: |
cd build cd build
ctest --verbose ctest -L main --verbose
ubuntu-22-cmake-sycl:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v2
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc)
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it. # how to debug it.
@ -202,7 +243,7 @@ jobs:
id: cmake_test id: cmake_test
run: | run: |
cd build cd build
ctest --verbose --timeout 900 ctest -L main --verbose --timeout 900
macOS-latest-cmake-ios: macOS-latest-cmake-ios:
runs-on: macos-latest runs-on: macos-latest
@ -295,7 +336,7 @@ jobs:
OPENBLAS_VERSION: 0.3.23 OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17 OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0 CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.21.1-2023-04-24 SDE_VERSION: 9.33.0-2024-01-07
strategy: strategy:
matrix: matrix:
@ -394,19 +435,19 @@ jobs:
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512 if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
run: | run: |
cd build cd build
ctest -C Release --verbose --timeout 900 ctest -L main -C Release --verbose --timeout 900
- name: Test (Intel SDE) - name: Test (Intel SDE)
id: cmake_test_sde id: cmake_test_sde
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
run: | run: |
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz" curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
# for some weird reason windows tar doesn't like sde tar.xz # for some weird reason windows tar doesn't like sde tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
cd build cd build
& $sde -future -- ctest -C Release --verbose --timeout 900 & $sde -future -- ctest -L main -C Release --verbose --timeout 900
- name: Determine tag name - name: Determine tag name
id: tag id: tag

View file

@ -28,13 +28,18 @@ jobs:
config: config:
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
# have disabled them for now until the reason why # have disabled them for now until the reason why
# is understood. # is understood.
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
steps: steps:
- name: Check out the repo - name: Check out the repo
uses: actions/checkout@v3 uses: actions/checkout@v3

View file

@ -2,13 +2,20 @@ name: Nix aarch64 builds
on: on:
workflow_dispatch: # allows manual triggering workflow_dispatch: # allows manual triggering
schedule:
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
# 1.5h instead of minutes with the cold cache).
#
# randint(0, 59), randint(0, 23)
- cron: '26 12 * * *'
# But also rebuild if we touched any of the Nix expressions:
push: push:
branches: branches:
- master - master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] paths: ['**/*.nix', 'flake.lock']
pull_request: pull_request:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] paths: ['**/*.nix', 'flake.lock']
jobs: jobs:
nix-build-aarch64: nix-build-aarch64:

View file

@ -5,10 +5,8 @@ on:
push: push:
branches: branches:
- master - master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
pull_request: pull_request:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
jobs: jobs:
nix-eval: nix-eval:

19
.gitignore vendored
View file

@ -27,7 +27,7 @@
lcov-report/ lcov-report/
gcovr-report/ gcovr-report/
build*/ build*
out/ out/
tmp/ tmp/
@ -89,20 +89,3 @@ examples/jeopardy/results.txt
poetry.lock poetry.lock
poetry.toml poetry.toml
# Test binaries
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-double-float
/tests/test-grad0
/tests/test-opt
/tests/test-quantize-fns
/tests/test-quantize-perf
/tests/test-sampling
/tests/test-tokenizer-0-llama
/tests/test-tokenizer-0-falcon
/tests/test-tokenizer-1-llama
/tests/test-tokenizer-1-bpe
/tests/test-rope
/tests/test-backend-ops
/tests/test-autorelease

View file

@ -1,5 +1,6 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX) project("llama.cpp" C CXX)
include(CheckIncludeFileCXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@ -47,6 +48,7 @@ option(BUILD_SHARED_LIBS "build shared libraries"
option(LLAMA_STATIC "llama: static link libraries" OFF) option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" ON) option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
option(LLAMA_LTO "llama: enable link time optimization" OFF) option(LLAMA_LTO "llama: enable link time optimization" OFF)
option(LLAMA_CCACHE "llama: use ccache if available" ON)
# debug # debug
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
@ -97,24 +99,38 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF) option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON) option(LLAMA_BUILD_SERVER "llama: build server example" ON)
# add perf arguments
option(LLAMA_PERF "llama: enable perf" OFF)
if (LLAMA_PERF)
add_definitions(-DGGML_PERF)
endif()
# Required for relocatable CMake package # Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
# #
# Compile flags # Compile flags
# #
if (LLAMA_SYCL)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD 11)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD_REQUIRED true)
@ -401,6 +417,22 @@ if (LLAMA_CLBLAST)
endif() endif()
endif() endif()
if (LLAMA_VULKAN)
find_package(Vulkan)
if (Vulkan_FOUND)
message(STATUS "Vulkan found")
add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
add_compile_definitions(GGML_USE_VULKAN)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan)
else()
message(WARNING "Vulkan not found")
endif()
endif()
if (LLAMA_HIPBLAS) if (LLAMA_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm) list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
@ -446,6 +478,32 @@ if (LLAMA_HIPBLAS)
endif() endif()
endif() endif()
if (LLAMA_SYCL)
if ( NOT DEFINED ENV{ONEAPI_ROOT})
message(FATAL_ERROR "ONEAPI_ROOT is not set, please install oneAPI and source it, e.g.: source /opt/intel/oneapi/setvars.sh")
endif()
#todo: AOT
find_package(IntelSYCL REQUIRED)
if (LLAMA_SYCL_F16)
add_compile_definitions(GGML_SYCL_F16)
endif()
add_compile_definitions(GGML_USE_SYCL)
add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
set(GGML_SOURCES_SYCL ggml-sycl.cpp)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
endif()
function(get_flags CCID CCVER) function(get_flags CCID CCVER)
set(C_FLAGS "") set(C_FLAGS "")
set(CXX_FLAGS "") set(CXX_FLAGS "")
@ -458,17 +516,24 @@ function(get_flags CCID CCVER)
(CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
(CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
) )
set(C_FLAGS ${C_FLAGS} -Wdouble-promotion) list(APPEND C_FLAGS -Wdouble-promotion)
endif() endif()
elseif (CCID STREQUAL "GNU") elseif (CCID STREQUAL "GNU")
set(C_FLAGS -Wdouble-promotion) set(C_FLAGS -Wdouble-promotion)
set(CXX_FLAGS -Wno-array-bounds) set(CXX_FLAGS -Wno-array-bounds)
if (CCVER VERSION_GREATER_EQUAL 7.1.0) if (CCVER VERSION_GREATER_EQUAL 7.1.0)
set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation) list(APPEND CXX_FLAGS -Wno-format-truncation)
endif() endif()
if (CCVER VERSION_GREATER_EQUAL 8.1.0) if (CCVER VERSION_GREATER_EQUAL 8.1.0)
set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi) list(APPEND CXX_FLAGS -Wextra-semi)
endif()
elseif (CCID MATCHES "Intel")
if (NOT LLAMA_SYCL)
# enable max optimization level when using Intel compiler
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
add_link_options(-fuse-ld=lld -static-intel)
endif() endif()
endif() endif()
@ -497,16 +562,18 @@ if (LLAMA_ALL_WARNINGS)
endif() endif()
endif() endif()
set(CUDA_CXX_FLAGS "")
if (LLAMA_CUBLAS) if (LLAMA_CUBLAS)
set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math) set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
if (NOT MSVC) if (NOT MSVC)
set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic) list(APPEND CUDA_FLAGS -Wno-pedantic)
endif() endif()
if (LLAMA_ALL_WARNINGS AND NOT MSVC) if (LLAMA_ALL_WARNINGS AND NOT MSVC)
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER}) list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
endif() endif()
execute_process( execute_process(
@ -534,15 +601,10 @@ if (LLAMA_CUBLAS)
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
get_flags(${CUDA_CCID} ${CUDA_CCVER}) get_flags(${CUDA_CCID} ${CUDA_CCVER})
list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS) # pass host compiler flags as a single argument list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
if (NOT CUDA_CXX_FLAGS STREQUAL "")
set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
endif() endif()
endif() endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
endif()
if (WIN32) if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS) add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
@ -561,6 +623,17 @@ if (LLAMA_LTO)
endif() endif()
endif() endif()
if (LLAMA_CCACHE)
find_program(LLAMA_CCACHE_FOUND ccache)
if (LLAMA_CCACHE_FOUND)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
set(ENV{CCACHE_SLOPPINESS} time_macros)
message(STATUS "Using ccache")
else()
message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF")
endif ()
endif()
# this version of Apple ld64 is buggy # this version of Apple ld64 is buggy
execute_process( execute_process(
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
@ -594,12 +667,7 @@ if (NOT MSVC)
endif() endif()
endif() endif()
function(add_compile_option_cpp ARG) set(ARCH_FLAGS "")
# Adds a compile option to C/C++ only, but not for Cuda.
# Use, e.g., for CPU-architecture flags.
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARG}>)
add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARG}>)
endfunction()
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64")) if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
message(STATUS "ARM detected") message(STATUS "ARM detected")
@ -612,19 +680,19 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
else() else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
add_compile_options(-mfp16-format=ieee) list(APPEND ARCH_FLAGS -mfp16-format=ieee)
endif() endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero # Raspberry Pi 1, Zero
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access) list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
endif() endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
# Raspberry Pi 2 # Raspberry Pi 2
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
endif() endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Raspberry Pi 3, 4, Zero 2 (32-bit) # Raspberry Pi 3, 4, Zero 2 (32-bit)
add_compile_options(-mno-unaligned-access) list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif() endif()
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" ) elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
@ -635,7 +703,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
include(cmake/FindSIMD.cmake) include(cmake/FindSIMD.cmake)
endif () endif ()
if (LLAMA_AVX512) if (LLAMA_AVX512)
add_compile_option_cpp(/arch:AVX512) list(APPEND ARCH_FLAGS /arch:AVX512)
# MSVC has no compile-time flags enabling specific # MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the # AVX512 extensions, neither it defines the
# macros corresponding to the extensions. # macros corresponding to the extensions.
@ -649,49 +717,61 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>) add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif() endif()
elseif (LLAMA_AVX2) elseif (LLAMA_AVX2)
add_compile_option_cpp(/arch:AVX2) list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (LLAMA_AVX) elseif (LLAMA_AVX)
add_compile_option_cpp(/arch:AVX) list(APPEND ARCH_FLAGS /arch:AVX)
endif() endif()
else() else()
if (LLAMA_NATIVE) if (LLAMA_NATIVE)
add_compile_option_cpp(-march=native) list(APPEND ARCH_FLAGS -march=native)
endif() endif()
if (LLAMA_F16C) if (LLAMA_F16C)
add_compile_option_cpp(-mf16c) list(APPEND ARCH_FLAGS -mf16c)
endif() endif()
if (LLAMA_FMA) if (LLAMA_FMA)
add_compile_option_cpp(-mfma) list(APPEND ARCH_FLAGS -mfma)
endif() endif()
if (LLAMA_AVX) if (LLAMA_AVX)
add_compile_option_cpp(-mavx) list(APPEND ARCH_FLAGS -mavx)
endif() endif()
if (LLAMA_AVX2) if (LLAMA_AVX2)
add_compile_option_cpp(-mavx2) list(APPEND ARCH_FLAGS -mavx2)
endif() endif()
if (LLAMA_AVX512) if (LLAMA_AVX512)
add_compile_option_cpp(-mavx512f) list(APPEND ARCH_FLAGS -mavx512f)
add_compile_option_cpp(-mavx512bw) list(APPEND ARCH_FLAGS -mavx512bw)
endif() endif()
if (LLAMA_AVX512_VBMI) if (LLAMA_AVX512_VBMI)
add_compile_option_cpp(-mavx512vbmi) list(APPEND ARCH_FLAGS -mavx512vbmi)
endif() endif()
if (LLAMA_AVX512_VNNI) if (LLAMA_AVX512_VNNI)
add_compile_option_cpp(-mavx512vnni) list(APPEND ARCH_FLAGS -mavx512vnni)
endif() endif()
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected") message(STATUS "PowerPC detected")
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
add_compile_options(-mcpu=powerpc64le) list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
else() else()
add_compile_options(-mcpu=native -mtune=native) list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
endif() endif()
else() else()
message(STATUS "Unknown architecture") message(STATUS "Unknown architecture")
endif() endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
if (LLAMA_CUBLAS)
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
endif()
if (MINGW) if (MINGW)
# Target Windows 8 for PrefetchVirtualMemory # Target Windows 8 for PrefetchVirtualMemory
add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER}) add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
@ -771,6 +851,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
) )
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES}) target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})

View file

@ -9,7 +9,7 @@ TEST_TARGETS = \
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
tests/test-backend-ops tests/test-autorelease tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
# Code coverage output files # Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@ -448,6 +448,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_CLBLAST endif # LLAMA_CLBLAST
ifdef LLAMA_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += -lvulkan
OBJS += ggml-vulkan.o
ifdef LLAMA_VULKAN_CHECK_RESULTS
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
endif
ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_VULKAN
ifdef LLAMA_HIPBLAS ifdef LLAMA_HIPBLAS
ifeq ($(wildcard /opt/rocm),) ifeq ($(wildcard /opt/rocm),)
@ -619,7 +632,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
@ -748,5 +761,8 @@ tests/test-c.o: tests/test-c.c llama.h
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

View file

@ -10,11 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
### Hot topics ### Hot topics
- ⚠️ Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow - New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
- Collecting Apple Silicon performance stats: - Collecting Apple Silicon performance stats:
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167 - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
- A-series: https://github.com/ggerganov/llama.cpp/discussions/4508 - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
---- ----
@ -63,7 +63,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
- AVX, AVX2 and AVX512 support for x86 architectures - AVX, AVX2 and AVX512 support for x86 architectures
- Mixed F16 / F32 precision - Mixed F16 / F32 precision
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support - 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
- CUDA, Metal and OpenCL GPU backend support - CUDA, Metal, OpenCL, SYCL GPU backend support
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022). The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
@ -112,6 +112,7 @@ as the main playground for developing new features for the [ggml](https://github
- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava) - [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5) - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V) - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
**Bindings:** **Bindings:**
@ -121,13 +122,15 @@ as the main playground for developing new features for the [ggml](https://github
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
**UI:** **UI:**
@ -596,6 +599,15 @@ Building the program with BLAS support may lead to some performance improvements
You can get a list of platforms and devices from the `clinfo -l` command, etc. You can get a list of platforms and devices from the `clinfo -l` command, etc.
- #### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
The SYCL backend in llama.cpp supports Intel GPUs (Data Center Max series, Flex series, Arc series, built-in GPUs and iGPUs).
For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
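As a quick orientation, a minimal configure-and-build sketch (mirroring the commands in the linked guide; it assumes oneAPI is installed under `/opt/intel/oneapi`):
```bash
# enable the oneAPI environment, then configure the SYCL backend with icx/icpx
source /opt/intel/oneapi/setvars.sh
mkdir -p build && cd build
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build . --config Release
```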
### Prepare Data & Run ### Prepare Data & Run
```bash ```bash
@ -929,17 +941,20 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
* Create a folder to store big models & intermediate files (ex. /llama/models) * Create a folder to store big models & intermediate files (ex. /llama/models)
#### Images #### Images
We have two Docker images available for this project: We have three Docker images available for this project:
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
Additionally, there are the following images, similar to the above: Additionally, there are the following images, similar to the above:
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
The GPU-enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now. The GPU-enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.
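For reference, a minimal sketch of building the plain CPU `server` image locally from the Dockerfile added in this change (the `local/llama.cpp:server` tag is just an example name):
```bash
# build the CPU-only server image from the new Dockerfile
docker build -t local/llama.cpp:server -f .devops/server.Dockerfile .
```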
@ -965,6 +980,12 @@ or with a light image:
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
``` ```
or with a server image:
```bash
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
```
### Docker With CUDA ### Docker With CUDA
Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container. Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@ -974,6 +995,7 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
```bash ```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile . docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile . docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
``` ```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
@ -987,6 +1009,7 @@ The resulting images, are essentially the same as the non-CUDA images:
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. 1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file. 2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
#### Usage #### Usage
@ -995,6 +1018,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
```bash ```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
``` ```
### Contributing ### Contributing

252
README_sycl.md Normal file
View file

@ -0,0 +1,252 @@
# llama.cpp for SYCL
[Background](#background)
[OS](#os)
[Intel GPU](#intel-gpu)
[Linux](#linux)
[Environment Variable](#environment-variable)
[Known Issue](#known-issue)
[Todo](#todo)
## Background
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
Intel uses SYCL as the direct programming language to support CPUs, GPUs and FPGAs.
To avoid reinventing the wheel, this code follows the other backend code paths in llama.cpp (OpenBLAS, cuBLAS, CLBlast). The open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used to migrate the code to SYCL.
The llama.cpp SYCL backend supports Intel GPUs.
For Intel CPUs, we recommend the regular x86 build of llama.cpp (with Intel MKL).
## OS
|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04|
|Windows|Ongoing| |
## Intel GPU
|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
## Linux
### Setup Environment
1. Install Intel GPU driver.
a. Please install the Intel GPU driver following the official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
Note: for iGPU, please install the client GPU driver.
b. Add your user to the video and render groups:
```
sudo usermod -aG render username
sudo usermod -aG video username
```
Note: log out and log back in for the group changes to take effect.
c. Check
```
sudo apt install clinfo
sudo clinfo -l
```
Output (example):
```
Platform #0: Intel(R) OpenCL Graphics
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
Platform #0: Intel(R) OpenCL HD Graphics
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```
2. Install Intel® oneAPI Base toolkit.
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
We recommend installing to the default folder: **/opt/intel/oneapi**.
The guide below uses the default folder as an example. If you installed to a different folder, adjust the paths accordingly.
b. Check
```
source /opt/intel/oneapi/setvars.sh
sycl-ls
```
There should be one or more Level-Zero devices listed, e.g. **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```
3. Build locally:
```
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
#for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v
```
or
```
./examples/sycl/build.sh
```
Note:
- By default, all binaries are built, which takes more time. To reduce build time, we recommend building only **example/main**.
### Run
1. Put the model file into the **models** folder
2. Enable the oneAPI runtime environment
```
source /opt/intel/oneapi/setvars.sh
```
3. List the device IDs
Run without parameters:
```
./build/bin/ls-sycl-device
or
./build/bin/main
```
Check the device IDs in the startup log, for example:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-Zero runtime, recommended|
|compute capability 3.0|OpenCL runtime, slower than Level-Zero in most cases|
4. Set the device ID and run llama.cpp
Set device ID = 0 with **GGML_SYCL_DEVICE=0**:
```
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
or run via the script:
```
./examples/sycl/run_llama2.sh
```
Note:
- By default, mmap is used to read the model file. On some systems this can cause a hang; use the **--no-mmap** parameter to disable mmap() and work around it, as shown below.
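For example, a minimal sketch of the earlier run command with mmap disabled (same model path as above):
```
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 --no-mmap
```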
5. Check the device ID in the output
For example:
```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```
## Environment Variable
#### Build
|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, do not set it.|
|CMAKE_C_COMPILER|icx|Use icx as the C compiler for the SYCL code path|
|CMAKE_CXX_COMPILER|icpx|Use icpx as the C++ compiler for the SYCL code path|
#### Running
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device ID to use. Check the available device IDs in the default startup output|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable debug logging via the GGML_SYCL_DEBUG macro|
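For illustration, a minimal sketch that combines the build-time and run-time variables above (paths and model file follow the earlier examples):
```
# build with FP16 SYCL kernels enabled
source /opt/intel/oneapi/setvars.sh
mkdir -p build && cd build
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build . --config Release
cd ..

# run on device 0 with SYCL debug logging enabled
GGML_SYCL_DEVICE=0 GGML_SYCL_DEBUG=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```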
## Known Issue
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
The oneAPI runtime environment is not enabled.
Install the oneAPI base toolkit and enable it with: `source /opt/intel/oneapi/setvars.sh`.
- Hang during startup
llama.cpp uses mmap by default to read the model file and copy it to the GPU. On some systems the memcpy can misbehave and block.
Solution: add **--no-mmap**.
## Todo
- Support building on Windows.
- Support multiple cards.

View file

@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with CUDA support # with CUDA support
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with SYCL support
source /opt/intel/oneapi/setvars.sh
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
``` ```

View file

@ -10,6 +10,9 @@
# # with CUDA support # # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# #
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>" echo "usage: $0 <output-dir> <mnt-dir>"
@ -22,9 +25,9 @@ mkdir -p "$2"
OUT=$(realpath "$1") OUT=$(realpath "$1")
MNT=$(realpath "$2") MNT=$(realpath "$2")
rm -v $OUT/*.log rm -f "$OUT/*.log"
rm -v $OUT/*.exit rm -f "$OUT/*.exit"
rm -v $OUT/*.md rm -f "$OUT/*.md"
sd=`dirname $0` sd=`dirname $0`
cd $sd/../ cd $sd/../
@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1" CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
fi fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
if [ -z ${ONEAPI_ROOT} ]; then
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
fi
## helpers ## helpers
# download a file if it does not exist or if it is outdated # download a file if it does not exist or if it is outdated
@ -94,7 +105,7 @@ function gg_run_ctest_debug {
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e set +e
} }
@ -123,9 +134,9 @@ function gg_run_ctest_release {
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
else else
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
fi fi
set +e set +e
@ -141,6 +152,61 @@ function gg_sum_ctest_release {
gg_printf '```\n' gg_printf '```\n'
} }
function gg_get_model {
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
if [[ -s $gguf_3b ]]; then
echo -n "$gguf_3b"
elif [[ -s $gguf_7b ]]; then
echo -n "$gguf_7b"
else
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
exit 1
fi
}
function gg_run_ctest_with_model_debug {
cd ${SRC}
local model; model=$(gg_get_model)
cd build-ci-debug
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
cd ..
}
function gg_run_ctest_with_model_release {
cd ${SRC}
local model; model=$(gg_get_model)
cd build-ci-release
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
cd ..
}
function gg_sum_ctest_with_model_debug {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest with model files in debug mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
}
function gg_sum_ctest_with_model_release {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest with model files in release mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
}
# open_llama_3b_v2 # open_llama_3b_v2
function gg_run_open_llama_3b_v2 { function gg_run_open_llama_3b_v2 {
@ -183,8 +249,6 @@ function gg_run_open_llama_3b_v2 {
wiki_test_60="${path_wiki}/wiki.test-60.raw" wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/test-autorelease ${model_f16}
./bin/quantize ${model_f16} ${model_q8_0} q8_0 ./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0 ./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1 ./bin/quantize ${model_f16} ${model_q4_1} q4_1
@ -507,14 +571,18 @@ function gg_sum_open_llama_7b_v2 {
## main ## main
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models
rm -rf ${SRC}/models-mnt rm -rf ${SRC}/models-mnt
mnt_models=${MNT}/models mnt_models=${MNT}/models
mkdir -p ${mnt_models} mkdir -p ${mnt_models}
ln -sfn ${mnt_models} ${SRC}/models-mnt ln -sfn ${mnt_models} ${SRC}/models-mnt
python3 -m pip install -r ${SRC}/requirements.txt # Create a fresh python3 venv and enter it
python3 -m pip install --editable gguf-py python3 -m venv "$MNT/venv"
source "$MNT/venv/bin/activate"
pip install -r ${SRC}/requirements.txt --disable-pip-version-check
pip install --editable gguf-py --disable-pip-version-check
fi fi
ret=0 ret=0
@ -529,6 +597,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
else else
test $ret -eq 0 && gg_run open_llama_7b_v2 test $ret -eq 0 && gg_run open_llama_7b_v2
fi fi
test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release
fi fi
fi fi

View file

@ -42,6 +42,10 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
#define GGML_USE_CUBLAS_SYCL
#endif
int32_t get_num_physical_cores() { int32_t get_num_physical_cores() {
#ifdef __linux__ #ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores // enumerate the set of thread siblings, num entries is num cores
@ -203,6 +207,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.prompt_cache_all = true; params.prompt_cache_all = true;
} else if (arg == "--prompt-cache-ro") { } else if (arg == "--prompt-cache-ro") {
params.prompt_cache_ro = true; params.prompt_cache_ro = true;
} else if (arg == "-bf" || arg == "--binary-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::ifstream file(argv[i], std::ios::binary);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
// store the external file name in params
params.prompt_file = argv[i];
std::ostringstream ss;
ss << file.rdbuf();
params.prompt = ss.str();
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
} else if (arg == "-f" || arg == "--file") { } else if (arg == "-f" || arg == "--file") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -582,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.main_gpu = std::stoi(argv[i]); params.main_gpu = std::stoi(argv[i]);
#ifndef GGML_USE_CUBLAS #ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--split-mode" || arg == "-sm") { } else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -601,9 +622,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true; invalid_param = true;
break; break;
} }
#ifndef GGML_USE_CUBLAS #ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--tensor-split" || arg == "-ts") { } else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -626,9 +648,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.tensor_split[i] = 0.0f; params.tensor_split[i] = 0.0f;
} }
} }
#ifndef GGML_USE_CUBLAS #ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--no-mmap") { } else if (arg == "--no-mmap") {
params.use_mmap = false; params.use_mmap = false;
} else if (arg == "--numa") { } else if (arg == "--numa") {
@ -653,6 +675,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
if (params.logdir.back() != DIRECTORY_SEPARATOR) { if (params.logdir.back() != DIRECTORY_SEPARATOR) {
params.logdir += DIRECTORY_SEPARATOR; params.logdir += DIRECTORY_SEPARATOR;
} }
} else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.logits_file = argv[i];
} else if (arg == "--perplexity" || arg == "--all-logits") { } else if (arg == "--perplexity" || arg == "--all-logits") {
params.logits_all = true; params.logits_all = true;
} else if (arg == "--ppl-stride") { } else if (arg == "--ppl-stride") {
@ -689,6 +717,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.winogrande_tasks = std::stoi(argv[i]); params.winogrande_tasks = std::stoi(argv[i]);
} else if (arg == "--multiple-choice") {
params.multiple_choice = true;
} else if (arg == "--multiple-choice-tasks") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.multiple_choice_tasks = std::stoi(argv[i]);
} else if (arg == "--kl-divergence") {
params.kl_divergence = true;
} else if (arg == "--ignore-eos") { } else if (arg == "--ignore-eos") {
params.ignore_eos = true; params.ignore_eos = true;
} else if (arg == "--no-penalize-nl") { } else if (arg == "--no-penalize-nl") {
@ -888,6 +926,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
printf(" -f FNAME, --file FNAME\n"); printf(" -f FNAME, --file FNAME\n");
printf(" prompt file to start generation.\n"); printf(" prompt file to start generation.\n");
printf(" -bf FNAME, --binary-file FNAME\n");
printf(" binary file containing multiple choice tasks.\n");
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@ -936,6 +976,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n"); printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base");
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
@ -969,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
#endif #endif // LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false"); printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n"); printf(" -gan N, --grp-attn-n N\n");
@ -1476,7 +1519,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");

View file

@ -91,6 +91,7 @@ struct gpt_params {
std::string input_suffix = ""; // string to suffix user inputs with std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files std::string logdir = ""; // directory in which to save YAML log files
std::string logits_file = ""; // file for saving *all* logits
std::vector<llama_model_kv_override> kv_overrides; std::vector<llama_model_kv_override> kv_overrides;
@ -108,6 +109,11 @@ struct gpt_params {
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
bool kl_divergence = false; // compute KL-divergence
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool random_prompt = false; // do not randomize prompt if none provided bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs bool use_color = false; // use color to distinguish generations and inputs

View file

@ -13,6 +13,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
// will be empty (default) if there are parse errors // will be empty (default) if there are parse errors
if (result->parsed_grammar.rules.empty()) { if (result->parsed_grammar.rules.empty()) {
fprintf(stderr, "%s: failed to parse grammar\n", __func__); fprintf(stderr, "%s: failed to parse grammar\n", __func__);
delete result;
return nullptr; return nullptr;
} }
@ -129,6 +130,8 @@ static void sampler_queue(
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const float temp = params.temp; const float temp = params.temp;
const float dynatemp_range = params.dynatemp_range;
const float dynatemp_exponent = params.dynatemp_exponent;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p; const float top_p = params.top_p;
const float min_p = params.min_p; const float min_p = params.min_p;
@ -143,7 +146,15 @@ static void sampler_queue(
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
case 't': llama_sample_temp (ctx_main, &cur_p, temp); break; case 't':
if (dynatemp_range > 0) {
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
} else {
llama_sample_temp(ctx_main, &cur_p, temp);
}
break;
default : break; default : break;
} }
} }
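For intuition about the dynamic-temperature branch above, a minimal Python sketch follows. The clamping of dynatemp_min/dynatemp_max mirrors the C++ exactly; the entropy-to-temperature mapping is an assumption about what llama_sample_entropy does (normalized Shannon entropy raised to dynatemp_exponent, scaled into the min/max range), so treat it as an illustration rather than the reference kernel.

    # Illustration only: the mapping inside llama_sample_entropy is assumed, not copied.
    import math

    def dynamic_temperature(probs, temp, dynatemp_range, dynatemp_exponent):
        dyn_min = max(0.0, temp - dynatemp_range)   # same clamping as sampler_queue above
        dyn_max = max(0.0, temp + dynatemp_range)
        entropy = -sum(p * math.log(p) for p in probs if p > 0.0)
        max_entropy = math.log(len(probs))          # entropy of a uniform distribution
        norm = entropy / max_entropy if max_entropy > 0.0 else 0.0
        return dyn_min + (dyn_max - dyn_min) * norm ** dynatemp_exponent

    # temp=0.8, range=0.5 -> bounds [0.3, 1.3]: a peaked distribution samples "colder",
    # a near-uniform one samples "hotter".
    print(dynamic_temperature([0.97, 0.01, 0.01, 0.01], 0.8, 0.5, 1.0))  # ~0.42
    print(dynamic_temperature([0.25, 0.25, 0.25, 0.25], 0.8, 0.5, 1.0))  # 1.3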

View file

@ -18,6 +18,8 @@ typedef struct llama_sampling_params {
float tfs_z = 1.00f; // 1.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.10f; // 1.0 = disabled float penalty_repeat = 1.10f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled

View file

@ -10,7 +10,7 @@ import re
import sys import sys
from enum import IntEnum from enum import IntEnum
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
import numpy as np import numpy as np
import torch import torch
@ -201,6 +201,8 @@ class Model:
return PlamoModel return PlamoModel
if model_architecture == "CodeShellForCausalLM": if model_architecture == "CodeShellForCausalLM":
return CodeShellModel return CodeShellModel
if model_architecture == "OrionForCausalLM":
return OrionModel
return Model return Model
def _is_model_safetensors(self) -> bool: def _is_model_safetensors(self) -> bool:
@ -250,6 +252,8 @@ class Model:
return gguf.MODEL_ARCH.PLAMO return gguf.MODEL_ARCH.PLAMO
if arch == "CodeShellForCausalLM": if arch == "CodeShellForCausalLM":
return gguf.MODEL_ARCH.CODESHELL return gguf.MODEL_ARCH.CODESHELL
if arch == "OrionForCausalLM":
return gguf.MODEL_ARCH.ORION
raise NotImplementedError(f'Architecture "{arch}" not supported!') raise NotImplementedError(f'Architecture "{arch}" not supported!')
@ -289,6 +293,58 @@ class Model:
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_qwen(self):
dir_model = self.dir_model
hparams = self.hparams
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams["vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
continue
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
assert len(merged) == 2
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
added_vocab = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
for i in range(vocab_size):
if i not in reverse_vocab:
pad_token = f"[PAD{i}]".encode("utf-8")
tokens.append(bytearray(pad_token))
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.CONTROL)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
special_vocab.merges = merges
# only add special tokens when they were not already loaded from config.json
if len(special_vocab.special_token_ids) == 0:
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
# this one is usually not in config.json anyway
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
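A toy walk-through of the merge reconstruction in _set_vocab_qwen above, using a hypothetical three-byte vocabulary rather than a real tiktoken table: splitting each multi-byte token with only lower-ranked merges (QwenModel.bpe with max_rank, shown further below) recovers exactly the two pieces that were joined to create it.

    # Hypothetical mergeable_ranks table; real ones come from a tiktoken tokenizer.
    mergeable_ranks = {b"a": 0, b"b": 1, b"c": 2, b"ab": 3, b"abc": 4}

    def bpe(ranks, token, max_rank=None):
        # same greedy pairwise merging as QwenModel.bpe below
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = min_rank = None
            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
                rank = ranks.get(pair[0] + pair[1])
                if rank is not None and (min_rank is None or rank < min_rank):
                    min_idx, min_rank = i, rank
            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
                break
            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
        return parts

    merges = []
    for token, rank in mergeable_ranks.items():
        if len(token) == 1:
            continue
        left, right = bpe(mergeable_ranks, token, max_rank=rank)
        merges.append((left, right))

    print(merges)  # [(b'a', b'b'), (b'ab', b'c')]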
def _set_vocab_sentencepiece(self): def _set_vocab_sentencepiece(self):
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
@ -487,6 +543,7 @@ class MPTModel(Model):
# map tensor names # map tensor names
if "scales" in name: if "scales" in name:
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
if new_name is not None:
new_name = new_name.replace("scales", "act.scales") new_name = new_name.replace("scales", "act.scales")
else: else:
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
@ -519,6 +576,83 @@ class MPTModel(Model):
self.gguf_writer.add_tensor("output.weight", data) self.gguf_writer.add_tensor("output.weight", data)
class OrionModel(Model):
def set_vocab(self):
self._set_vocab_sentencepiece()
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
head_count = self.hparams["num_attention_heads"]
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
hf_repo = self.hparams.get("_name_or_path", "")
ctx_length = 0
if "max_sequence_length" in self.hparams:
ctx_length = self.hparams["max_sequence_length"]
elif "max_position_embeddings" in self.hparams:
ctx_length = self.hparams["max_position_embeddings"]
elif "model_max_length" in self.hparams:
ctx_length = self.hparams["model_max_length"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
def write_tensors(self):
# Collect tensors from generator object
model_kv = dict(self.get_tensors())
block_count = self.hparams["num_hidden_layers"]
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in model_kv.items():
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
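The dtype handling in write_tensors above follows the same policy used by the other converters in this script; here is a compact restatement as a standalone helper, provided only as a sketch for readability (it is not code from the repository).

    import numpy as np

    def target_dtype(ftype: int, data: np.ndarray, name: str) -> np.dtype:
        # ftype == 0 -> mostly f32 output, ftype == 1 -> mostly f16 output
        n_dims = data.ndim
        if ftype == 0 and data.dtype == np.float16:
            return np.dtype(np.float32)    # f32 output: upcast stored f16 tensors
        if ftype == 1 and data.dtype == np.float16 and n_dims == 1:
            return np.dtype(np.float32)    # 1-D tensors are kept in f32 even for f16 output
        if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            return np.dtype(np.float16)    # 2-D weight matrices are downcast to f16
        return data.dtype                  # everything else keeps its dtype

    # e.g. a 2-D float32 weight becomes float16 when converting with --outtype f16
    print(target_dtype(1, np.zeros((4, 4), dtype=np.float32), "blk.0.attn_q.weight"))  # float16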
class BaichuanModel(Model): class BaichuanModel(Model):
def set_vocab(self): def set_vocab(self):
self._set_vocab_sentencepiece() self._set_vocab_sentencepiece()
@ -876,6 +1010,13 @@ class PersimmonModel(Model):
class StableLMModel(Model): class StableLMModel(Model):
def set_vocab(self):
if (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2()
else:
# StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
self._set_vocab_qwen()
def set_gguf_parameters(self): def set_gguf_parameters(self):
hparams = self.hparams hparams = self.hparams
block_count = hparams["num_hidden_layers"] block_count = hparams["num_hidden_layers"]
@ -904,7 +1045,7 @@ class QwenModel(Model):
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
@staticmethod @staticmethod
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]: def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
parts = [bytes([b]) for b in token] parts = [bytes([b]) for b in token]
while True: while True:
min_idx = None min_idx = None
@ -921,52 +1062,7 @@ class QwenModel(Model):
return parts return parts
def set_vocab(self): def set_vocab(self):
dir_model = self.dir_model self._set_vocab_qwen()
hparams = self.hparams
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams["vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[self.token_bytes_to_string(token)] = rank
if len(token) == 1:
continue
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
assert len(merged) == 2
merges.append(' '.join(map(self.token_bytes_to_string, merged)))
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
added_vocab = tokenizer.special_tokens
for i in range(vocab_size):
if i not in reverse_vocab:
pad_token = f"[PAD{i}]".encode("utf-8")
tokens.append(bytearray(pad_token))
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.CONTROL)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
special_vocab.merges = merges
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name("Qwen") self.gguf_writer.add_name("Qwen")
@ -1285,7 +1381,7 @@ def main() -> None:
if args.awq_path: if args.awq_path:
sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
from awq.apply_awq import add_scale_weights from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
tmp_model_path = args.model / "weighted_model" tmp_model_path = args.model / "weighted_model"
dir_model = tmp_model_path dir_model = tmp_model_path
if tmp_model_path.is_dir(): if tmp_model_path.is_dir():

View file

@ -2,6 +2,7 @@
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import os
import struct import struct
import sys import sys
from enum import IntEnum from enum import IntEnum
@ -9,7 +10,6 @@ from pathlib import Path
import numpy as np import numpy as np
import os
if 'NO_LOCAL_GGUF' not in os.environ: if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
@ -371,15 +371,11 @@ def handle_metadata(cfg, hp):
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path) params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
else: else:
raise ValueError('Unable to load metadata') raise ValueError('Unable to load metadata')
vocab = convert.load_vocab( vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, vocab_factory = convert.VocabFactory(vocab_path)
cfg.vocabtype) vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
# FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
load_merges = cfg.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
convert.check_vocab_size(params, vocab) convert.check_vocab_size(params, vocab)
return (params, vocab, svocab) return params, vocab, special_vocab
def handle_args(): def handle_args():

View file

@ -5,17 +5,16 @@ import json
import os import os
import struct import struct
import sys import sys
from pathlib import Path
from typing import Any, BinaryIO, Sequence from typing import Any, BinaryIO, Sequence
import numpy as np import numpy as np
import torch import torch
from pathlib import Path
if 'NO_LOCAL_GGUF' not in os.environ: if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf import gguf
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
@ -60,7 +59,14 @@ if __name__ == '__main__':
input_model = os.path.join(sys.argv[1], "adapter_model.bin") input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
if os.path.exists(input_model):
model = torch.load(input_model, map_location="cpu") model = torch.load(input_model, map_location="cpu")
else:
input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file
model = load_file(input_model, device="cpu")
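The .bin/.safetensors fallback above can be read as a tiny helper; a sketch of the same logic factored into a function (file names follow the PEFT convention already used in the script):

    import os
    import torch
    from safetensors.torch import load_file

    def load_adapter(lora_dir: str) -> dict[str, torch.Tensor]:
        # prefer the classic pickle checkpoint, fall back to safetensors
        bin_path = os.path.join(lora_dir, "adapter_model.bin")
        if os.path.exists(bin_path):
            return torch.load(bin_path, map_location="cpu")
        return load_file(os.path.join(lora_dir, "adapter_model.safetensors"), device="cpu")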
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
if arch_name not in gguf.MODEL_ARCH_NAMES.values(): if arch_name not in gguf.MODEL_ARCH_NAMES.values():

View file

@ -1,11 +1,13 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import torch
import os
from pprint import pprint
import sys
import argparse import argparse
import os
import sys
from pathlib import Path from pathlib import Path
from pprint import pprint
import torch
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ: if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
@ -69,7 +71,7 @@ def main():
persimmon_model = torch.load(args.ckpt_path) persimmon_model = torch.load(args.ckpt_path)
hparams = persimmon_model['args'] hparams = persimmon_model['args']
pprint(hparams) pprint(hparams)
tensors = {} tensors: dict[str, torch.Tensor] = {}
_flatten_dict(persimmon_model['model'], tensors, None) _flatten_dict(persimmon_model['model'], tensors, None)
arch = gguf.MODEL_ARCH.PERSIMMON arch = gguf.MODEL_ARCH.PERSIMMON

View file

@ -17,58 +17,28 @@ import signal
import struct import struct
import sys import sys
import time import time
import warnings
import zipfile import zipfile
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
from argparse import ArgumentParser
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import ( from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
IO,
TYPE_CHECKING,
Any,
Callable,
Iterable,
Literal,
Optional,
Tuple,
TypeVar,
)
import numpy as np import numpy as np
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
try: if 'NO_LOCAL_GGUF' not in os.environ:
from transformers import AutoTokenizer sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
except ModuleNotFoundError as e:
warnings.warn(f"Could not import AutoTokenizer from transformers: {e}")
# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory
if "NO_LOCAL_GGUF" not in os.environ:
# Use absolute path to the gguf-py directory
gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py")
print(gguf_py_dir) # NOTE: Remove this once path is verified after changes are completed
if gguf_py_dir not in sys.path:
sys.path.insert(1, gguf_py_dir)
# Import gguf module
try:
import gguf import gguf
except ModuleNotFoundError as e:
print(f"Could not import gguf: {e}")
sys.exit(1)
if TYPE_CHECKING: # NOTE: This isn't necessary. if TYPE_CHECKING:
from typing import TypeAlias # This can technically be omitted. from typing import TypeAlias
if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"): if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
faulthandler.register(signal.SIGUSR1) faulthandler.register(signal.SIGUSR1)
# NOTE: n-dimensional arrays should be directly referenced NDArray: TypeAlias = 'np.ndarray[Any, Any]'
NDArray: TypeAlias = "np.ndarray[Any, Any]"
# Why is this here? LLAMA and GPT are technically the only compatible ARCHs.
ARCH = gguf.MODEL_ARCH.LLAMA ARCH = gguf.MODEL_ARCH.LLAMA
DEFAULT_CONCURRENCY = 8 DEFAULT_CONCURRENCY = 8
@ -78,7 +48,6 @@ DEFAULT_CONCURRENCY = 8
# #
# TODO: Clean up and refactor data types
@dataclass(frozen=True) @dataclass(frozen=True)
class DataType: class DataType:
name: str name: str
@ -190,57 +159,37 @@ class Params:
n_ff: int n_ff: int
n_head: int n_head: int
n_head_kv: int n_head_kv: int
f_norm_eps: Optional[float] = None n_experts: int | None = None
n_experts: Optional[int] = None n_experts_used: int | None = None
n_experts_used: Optional[int] = None f_norm_eps: float | None = None
rope_scaling_type: Optional[gguf.RopeScalingType] = None rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: Optional[float] = None f_rope_freq_base: float | None = None
f_rope_scale: Optional[float] = None f_rope_scale: float | None = None
n_orig_ctx: Optional[int] = None n_orig_ctx: int | None = None
rope_finetuned: Optional[bool] = None rope_finetuned: bool | None = None
ftype: Optional[GGMLFileType] = None ftype: GGMLFileType | None = None
# path to the directory containing the model files # path to the directory containing the model files
path_model: Optional[Path] = None path_model: Path | None = None
@staticmethod @staticmethod
def guessed(model: LazyModel) -> "Params": def guessed(model: LazyModel) -> Params:
# try transformer naming first # try transformer naming first
n_vocab, n_embd = ( n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
model["model.embed_tokens.weight"].shape
if "model.embed_tokens.weight" in model
else model["tok_embeddings.weight"].shape
)
# try transformer naming first # try transformer naming first
if "model.layers.0.self_attn.q_proj.weight" in model: if "model.layers.0.self_attn.q_proj.weight" in model:
n_layer = next( n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
i elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
for i in itertools.count() n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
if f"model.layers.{i}.self_attn.q_proj.weight" not in model
)
elif (
"model.layers.0.self_attn.W_pack.weight" in model
): # next: try baichuan naming
n_layer = next(
i
for i in itertools.count()
if f"model.layers.{i}.self_attn.W_pack.weight" not in model
)
else: else:
n_layer = next( n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
i
for i in itertools.count()
if f"layers.{i}.attention.wq.weight" not in model
)
if n_layer < 1: if n_layer < 1:
raise Exception( raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
"failed to guess 'n_layer'. This model is unknown or unsupported.\n" "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
"Suggestion: provide 'config.json' of the model in the same directory containing model files."
)
n_head = n_embd // 128 # guessed n_head = n_embd // 128 # guessed
n_mult = 256 # guessed n_mult = 256 # guessed
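The n_layer probe a few lines above is simply "find the first block index whose attention tensors are missing"; a toy demonstration with a hypothetical, trimmed-down state dict:

    import itertools

    # hypothetical state dict, reduced to the keys the probe inspects
    model = {
        "model.embed_tokens.weight": None,
        "model.layers.0.self_attn.q_proj.weight": None,
        "model.layers.1.self_attn.q_proj.weight": None,
    }

    n_layer = next(i for i in itertools.count()
                   if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
    print(n_layer)  # 2 -> the model has two transformer blocks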
@ -261,7 +210,7 @@ class Params:
) )
@staticmethod @staticmethod
def load_transformers_config(model: LazyModel, config_path: Path) -> "Params": def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
@ -274,20 +223,18 @@ class Params:
rope_scaling_type = gguf.RopeScalingType.LINEAR rope_scaling_type = gguf.RopeScalingType.LINEAR
elif typ == "yarn": elif typ == "yarn":
rope_scaling_type = gguf.RopeScalingType.YARN rope_scaling_type = gguf.RopeScalingType.YARN
n_orig_ctx = rope_scaling["original_max_position_embeddings"] n_orig_ctx = rope_scaling['original_max_position_embeddings']
rope_finetuned = rope_scaling["finetuned"] rope_finetuned = rope_scaling['finetuned']
else: else:
raise NotImplementedError(f"Unknown rope scaling type: {typ}") raise NotImplementedError(f'Unknown rope scaling type: {typ}')
if "max_sequence_length" in config: if "max_sequence_length" in config:
n_ctx = config["max_sequence_length"] n_ctx = config["max_sequence_length"]
elif "max_position_embeddings" in config: elif "max_position_embeddings" in config:
n_ctx = config["max_position_embeddings"] n_ctx = config["max_position_embeddings"]
else: else:
raise Exception( raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
"failed to guess 'n_ctx'. This model is unknown or unsupported.\n" "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
"Suggestion: provide 'config.json' of the model in the same directory containing model files."
)
n_experts = None n_experts = None
n_experts_used = None n_experts_used = None
@ -317,7 +264,7 @@ class Params:
# LLaMA v2 70B params.json # LLaMA v2 70B params.json
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
@staticmethod @staticmethod
def load_torch_params(model: LazyModel, config_path: Path) -> "Params": def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
n_experts = None n_experts = None
@ -362,31 +309,31 @@ class Params:
) )
@staticmethod @staticmethod
def load(model_plus: ModelPlus) -> "Params": def load(model_plus: ModelPlus) -> Params:
hf_config_path = model_plus.paths[0].parent / "config.json" hf_config_path = model_plus.paths[0].parent / "config.json"
orig_config_path = model_plus.paths[0].parent / "params.json" orig_config_path = model_plus.paths[0].parent / "params.json"
if hf_config_path.exists(): if hf_config_path.exists():
params = Params.load_transformers_config(model_plus.model, hf_config_path) params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
elif orig_config_path.exists(): elif orig_config_path.exists():
params = Params.load_torch_params(model_plus.model, orig_config_path) params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
elif model_plus.format != "none": elif model_plus.format != 'none':
params = Params.guessed(model_plus.model) params = Params.guessed(model_plus.model)
else: else:
raise ValueError("Cannot guess params when model format is none") raise ValueError('Cannot guess params when model format is none')
params.path_model = model_plus.paths[0].parent params.path_model = model_plus.paths[0].parent
return params return params
class BpeVocab: # GPT #
def __init__( # vocab
self, fname_tokenizer: Path, fname_added_tokens: Optional[Path] #
) -> None:
self.bpe_tokenizer = json.loads( class BpeVocab:
open(str(fname_tokenizer), encoding="utf-8").read() def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
) self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
self.vocab = self.bpe_tokenizer["model"]["vocab"] self.vocab = self.bpe_tokenizer["model"]["vocab"]
added_tokens: dict[str, int] added_tokens: dict[str, int]
if fname_added_tokens is not None: if fname_added_tokens is not None:
@ -394,26 +341,23 @@ class BpeVocab: # GPT
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else: else:
# Fall back to trying to find the added tokens in tokenizer.json # Fall back to trying to find the added tokens in tokenizer.json
tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json" tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
if not tokenizer_json_file.is_file(): if not tokenizer_json_file.is_file():
added_tokens = {} added_tokens = {}
else: else:
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
added_tokens = dict( added_tokens = dict(
(item["content"], item["id"]) (item['content'], item['id'])
for item in tokenizer_json.get("added_tokens", []) for item in tokenizer_json.get('added_tokens', [])
# Added tokens here can be duplicates of the main vocabulary. # Added tokens here can be duplicates of the main vocabulary.
if item["content"] not in self.bpe_tokenizer if item['content'] not in self.bpe_tokenizer)
)
vocab_size: int = len(self.vocab) vocab_size: int = len(self.vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values()) actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids: if expected_ids != actual_ids:
expected_end_id = vocab_size + len(actual_ids) - 1 expected_end_id = vocab_size + len(actual_ids) - 1
raise Exception( raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}"
)
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_dict = added_tokens self.added_tokens_dict = added_tokens
@ -442,10 +386,8 @@ class BpeVocab: # GPT
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>" return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab: # LlaMa class SentencePieceVocab:
def __init__( def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
added_tokens: dict[str, int] added_tokens: dict[str, int]
if fname_added_tokens is not None: if fname_added_tokens is not None:
@ -455,16 +397,12 @@ class SentencePieceVocab: # LlaMa
vocab_size: int = self.sentencepiece_tokenizer.vocab_size() vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
new_tokens = { new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
id: piece for piece, id in added_tokens.items() if id >= vocab_size
}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys()) actual_new_ids = sorted(new_tokens.keys())
if expected_new_ids != actual_new_ids: if expected_new_ids != actual_new_ids:
raise ValueError( raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}"
)
# Token pieces that were added to the base vocabulary. # Token pieces that were added to the base vocabulary.
self.added_tokens_dict = added_tokens self.added_tokens_dict = added_tokens
@ -512,11 +450,15 @@ class SentencePieceVocab: # LlaMa
class HfVocab: class HfVocab:
def __init__( def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
self, try:
fname_tokenizer: Path, from transformers import AutoTokenizer
fname_added_tokens: Optional[Path] = None, except ImportError as e:
) -> None: raise ImportError(
"To use HfVocab, please install the `transformers` package. "
"You can install it with `pip install transformers`."
) from e
print("fname_tokenizer:", fname_tokenizer) print("fname_tokenizer:", fname_tokenizer)
# Allow the tokenizer to default to slow or fast versions. # Allow the tokenizer to default to slow or fast versions.
# Explicitly set tokenizer to use local paths. # Explicitly set tokenizer to use local paths.
@ -555,7 +497,7 @@ class HfVocab:
self.fname_tokenizer = fname_tokenizer self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens self.fname_added_tokens = fname_added_tokens
def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]: def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = { reverse_vocab = {
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
} }
@ -573,11 +515,9 @@ class HfVocab:
token_id, self.special_ids # Reuse already stored special IDs token_id, self.special_ids # Reuse already stored special IDs
) )
def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType: def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
# Determine token type based on whether it's a special token # Determine token type based on whether it's a special token
return ( return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
)
def get_token_score(self, token_id: int) -> float: def get_token_score(self, token_id: int) -> float:
# Placeholder for actual logic to determine the token's score # Placeholder for actual logic to determine the token's score
@ -589,7 +529,6 @@ class HfVocab:
if text in self.specials: if text in self.specials:
toktype = self.get_token_type(self.specials[text], self.special_ids) toktype = self.get_token_type(self.specials[text], self.special_ids)
score = self.get_token_score(self.specials[text]) score = self.get_token_score(self.specials[text])
else: else:
toktype = gguf.TokenType.USER_DEFINED toktype = gguf.TokenType.USER_DEFINED
score = -1000.0 score = -1000.0
@ -783,7 +722,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
else: else:
model = merge_sharded([mp.model for mp in models_plus]) model = merge_sharded([mp.model for mp in models_plus])
return ModelPlus(model, paths, format, vocab) return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@ -871,17 +810,13 @@ class LazyUnpickler(pickle.Unpickler):
CLASSES: dict[tuple[str, str], Any] = { CLASSES: dict[tuple[str, str], Any] = {
# getattr used here as a workaround for mypy not being smart enough to determine # getattr used here as a workaround for mypy not being smart enough to determine
# the staticmethods have a __func__ attribute. # the staticmethods have a __func__ attribute.
("torch._tensor", "_rebuild_from_type_v2"): getattr( ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
rebuild_from_type_v2, "__func__" ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
), ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
("torch._utils", "_rebuild_tensor_v2"): getattr( ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
lazy_rebuild_tensor_v2, "__func__" ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
), ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16), ('torch', 'Tensor'): LazyTensor,
("torch", "HalfStorage"): LazyStorageKind(DT_F16),
("torch", "FloatStorage"): LazyStorageKind(DT_F32),
("torch", "IntStorage"): LazyStorageKind(DT_I32),
("torch", "Tensor"): LazyTensor,
} }
def find_class(self, module: str, name: str) -> Any: def find_class(self, module: str, name: str) -> Any:
@ -1022,12 +957,8 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
class OutputFile: class OutputFile:
def __init__( def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
) -> None:
self.gguf = gguf.GGUFWriter(
fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess
)
def add_meta_arch(self, params: Params) -> None: def add_meta_arch(self, params: Params) -> None:
name = "LLaMA" name = "LLaMA"
@ -1036,7 +967,7 @@ class OutputFile:
if params.n_ctx == 4096: if params.n_ctx == 4096:
name = "LLaMA v2" name = "LLaMA v2"
elif params.path_model is not None: elif params.path_model is not None:
name = str(params.path_model.parent).split("/")[-1] name = str(params.path_model.parent).split('/')[-1]
self.gguf.add_name (name) self.gguf.add_name (name)
self.gguf.add_context_length (params.n_ctx) self.gguf.add_context_length (params.n_ctx)
@ -1047,17 +978,17 @@ class OutputFile:
self.gguf.add_head_count (params.n_head) self.gguf.add_head_count (params.n_head)
self.gguf.add_head_count_kv (params.n_head_kv) self.gguf.add_head_count_kv (params.n_head_kv)
if params.f_norm_eps is None:
raise ValueError("f_norm_eps is None")
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
if params.n_experts: if params.n_experts:
self.gguf.add_expert_count(params.n_experts) self.gguf.add_expert_count(params.n_experts)
if params.n_experts_used: if params.n_experts_used:
self.gguf.add_expert_used_count(params.n_experts_used) self.gguf.add_expert_used_count(params.n_experts_used)
if params.f_norm_eps is not None:
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
else:
raise ValueError('f_norm_eps is None')
if params.f_rope_freq_base is not None: if params.f_rope_freq_base is not None:
self.gguf.add_rope_freq_base(params.f_rope_freq_base) self.gguf.add_rope_freq_base(params.f_rope_freq_base)
@ -1089,7 +1020,7 @@ class OutputFile:
return tokenizer_model return tokenizer_model
def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]: def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
tokens = [] tokens = []
scores = [] scores = []
toktypes = [] toktypes = []
@ -1124,14 +1055,10 @@ class OutputFile:
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
n_elements = int(np.prod(tensor.shape)) n_elements = int(np.prod(tensor.shape))
raw_dtype = getattr(tensor.data_type, "ggml_type", None) raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
data_type = ( data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype
)
data_nbytes = tensor.data_type.elements_to_bytes(n_elements) data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
self.gguf.add_tensor_info( self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype
)
def write_meta(self) -> None: def write_meta(self) -> None:
self.gguf.write_header_to_file() self.gguf.write_header_to_file()
@ -1145,12 +1072,8 @@ class OutputFile:
@staticmethod @staticmethod
def write_vocab_only( def write_vocab_only(
fname_out: Path, fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
params: Params, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
vocab: Vocab,
svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab = pad_vocab) check_vocab_size(params, vocab, pad_vocab = pad_vocab)
@ -1180,14 +1103,8 @@ class OutputFile:
@staticmethod @staticmethod
def write_all( def write_all(
fname_out: Path, fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
ftype: GGMLFileType, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
params: Params,
model: LazyModel,
vocab: Vocab,
svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False, pad_vocab: bool = False,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab) check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@ -1207,26 +1124,19 @@ class OutputFile:
of.write_tensor_info() of.write_tensor_info()
# tensor data # tensor data
ndarrays_inner = bounded_parallel_map( ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
OutputFile.do_item, model.items(), concurrency=concurrency
)
if ftype == GGMLFileType.MostlyQ8_0: if ftype == GGMLFileType.MostlyQ8_0:
ndarrays = bounded_parallel_map( ndarrays = bounded_parallel_map(
OutputFile.maybe_do_quantize, OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
ndarrays_inner,
concurrency=concurrency,
max_workers=concurrency,
use_processpool_executor=True, use_processpool_executor=True,
) )
else: else:
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
start = time.time() start = time.time()
for i, ((name, lazy_tensor), ndarray) in enumerate( for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
zip(model.items(), ndarrays)
):
elapsed = time.time() - start elapsed = time.time() - start
size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape) size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model))) padi = len(str(len(model)))
print( print(
f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
@ -1363,7 +1273,7 @@ def load_some_model(path: Path) -> ModelPlus:
class VocabFactory: class VocabFactory:
def __init__(self, path: Path): def __init__(self, path: Path):
self.path = path self.path = path
self.files = { self.files: dict[str, Path | None] = {
"tokenizer.model": None, "tokenizer.model": None,
"vocab.json": None, "vocab.json": None,
"tokenizer.json": None, "tokenizer.json": None,
@ -1380,24 +1290,18 @@ class VocabFactory:
self.files[file] = parent_file_path self.files[file] = parent_file_path
print(f"Found vocab files: {self.files}") print(f"Found vocab files: {self.files}")
def _select_file(self, vocabtype: Optional[str]) -> Path: def _select_file(self, vocabtype: str | None) -> Path:
if vocabtype in ["spm", "bpe"]: if vocabtype in ["spm", "bpe"]:
for file_key in self.files.keys(): for file_key in self.files.keys():
if self.files[file_key]: if (file := self.files[file_key]) is not None:
return self.files[file_key] return file
raise FileNotFoundError(f"{vocabtype} vocab not found.") raise FileNotFoundError(f"{vocabtype} vocab not found.")
elif vocabtype == "hfft": if vocabtype == "hfft":
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
return self.path return self.path
else:
raise ValueError(f"Unsupported vocabulary type {vocabtype}") raise ValueError(f"Unsupported vocabulary type {vocabtype}")
def _create_special_vocab( def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
self,
vocab: Vocab,
vocabtype: str,
model_parent_path: Path,
) -> gguf.SpecialVocab:
load_merges = vocabtype == "bpe" load_merges = vocabtype == "bpe"
n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
return gguf.SpecialVocab( return gguf.SpecialVocab(
@ -1407,13 +1311,12 @@ class VocabFactory:
n_vocab=n_vocab, n_vocab=n_vocab,
) )
def load_vocab( def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
self, vocabtype: str, model_parent_path: Path
) -> Tuple[Vocab, gguf.SpecialVocab]:
path = self._select_file(vocabtype) path = self._select_file(vocabtype)
print(f"Loading vocab file '{path}', type '{vocabtype}'") print(f"Loading vocab file '{path}', type '{vocabtype}'")
added_tokens_path = path.parent / "added_tokens.json" added_tokens_path = path.parent / "added_tokens.json"
vocab: Vocab
if vocabtype == "bpe": if vocabtype == "bpe":
vocab = BpeVocab( vocab = BpeVocab(
path, added_tokens_path if added_tokens_path.exists() else None path, added_tokens_path if added_tokens_path.exists() else None
@ -1428,6 +1331,7 @@ class VocabFactory:
) )
else: else:
raise ValueError(f"Unsupported vocabulary type {vocabtype}") raise ValueError(f"Unsupported vocabulary type {vocabtype}")
# FIXME: Respect --vocab-dir?
special_vocab = self._create_special_vocab( special_vocab = self._create_special_vocab(
vocab, vocab,
vocabtype, vocabtype,
@ -1436,7 +1340,7 @@ class VocabFactory:
return vocab, special_vocab return vocab, special_vocab
def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path: def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
namestr = { namestr = {
GGMLFileType.AllF32: "f32", GGMLFileType.AllF32: "f32",
GGMLFileType.MostlyF16: "f16", GGMLFileType.MostlyF16: "f16",
@ -1446,8 +1350,7 @@ def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Pat
if ret in model_paths: if ret in model_paths:
sys.stderr.write( sys.stderr.write(
f"Error: Default output path ({ret}) would overwrite the input. " f"Error: Default output path ({ret}) would overwrite the input. "
"Please explicitly specify a path using --outfile.\n" "Please explicitly specify a path using --outfile.\n")
)
sys.exit(1) sys.exit(1)
return ret return ret
@ -1457,111 +1360,34 @@ def do_dump_model(model_plus: ModelPlus) -> None:
print(f"model_plus.format = {model_plus.format!r}") print(f"model_plus.format = {model_plus.format!r}")
print(f"model_plus.vocab = {model_plus.vocab!r}") print(f"model_plus.vocab = {model_plus.vocab!r}")
for name, lazy_tensor in model_plus.model.items(): for name, lazy_tensor in model_plus.model.items():
print( print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}"
)
def get_argument_parser() -> ArgumentParser: def main(args_in: list[str] | None = None) -> None:
output_choices = ["f32", "f16"] output_choices = ["f32", "f16"]
if np.uint32(1) == np.uint32(1).newbyteorder("<"): if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# We currently only support Q8_0 output on little endian systems. # We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0") output_choices.append("q8_0")
vocab_types = ["spm", "bpe", "hfft"]
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser = argparse.ArgumentParser( args = parser.parse_args(args_in)
description="Convert a LLaMa model to a GGML compatible file"
)
parser.add_argument(
"model",
type=Path,
help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)",
)
parser.add_argument(
"--awq-path",
type=Path,
help="Path to the Activation-aware Weight Quantization cache file",
default=None,
)
parser.add_argument(
"--dump",
action="store_true",
help="Display the model content without converting it",
)
parser.add_argument(
"--dump-single",
action="store_true",
help="Display the content of a single model file without conversion",
)
parser.add_argument(
"--vocab-only",
action="store_true",
help="Extract and output only the vocabulary",
)
parser.add_argument(
"--outtype",
choices=output_choices,
help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)",
)
parser.add_argument(
"--vocab-dir",
type=Path,
help="Directory containing the tokenizer.model, if separate from the model file",
)
parser.add_argument(
"--vocab-type",
choices=["spm", "bpe", "hfft"], # hfft: Hugging Face Fast Tokenizer
default="spm",
help="The vocabulary format used to define the tokenizer model (default: spm)",
)
parser.add_argument(
"--pad-vocab",
action="store_true",
help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata",
)
parser.add_argument(
"--outfile",
type=Path,
help="Specify the path for the output file (default is based on input)",
)
parser.add_argument(
"--ctx", type=int, help="Model training context (default is based on input)"
)
parser.add_argument(
"--concurrency",
type=int,
help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})",
default=DEFAULT_CONCURRENCY,
)
parser.add_argument(
"--big-endian",
action="store_true",
help="Indicate that the model is executed on a big-endian machine",
)
return parser
def main(argv: Optional[list[str]] = None) -> None:
parser = get_argument_parser()
args = parser.parse_args(argv)
if args.awq_path: if args.awq_path:
sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py")) sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
from awq.apply_awq import add_scale_weights from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
tmp_model_path = args.model / "weighted_model" tmp_model_path = args.model / "weighted_model"
if tmp_model_path.is_dir(): if tmp_model_path.is_dir():
print(f"{tmp_model_path} exists as a weighted model.") print(f"{tmp_model_path} exists as a weighted model.")
@ -1580,14 +1406,11 @@ def main(argv: Optional[list[str]] = None) -> None:
if not args.vocab_only: if not args.vocab_only:
model_plus = load_some_model(args.model) model_plus = load_some_model(args.model)
else: else:
model_plus = ModelPlus( model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
model={}, paths=[args.model / "dummy"], format="none", vocab=None
)
if args.dump: if args.dump:
do_dump_model(model_plus) do_dump_model(model_plus)
return return
endianess = gguf.GGUFEndian.LITTLE endianess = gguf.GGUFEndian.LITTLE
if args.big_endian: if args.big_endian:
endianess = gguf.GGUFEndian.BIG endianess = gguf.GGUFEndian.BIG
@ -1595,12 +1418,10 @@ def main(argv: Optional[list[str]] = None) -> None:
params = Params.load(model_plus) params = Params.load(model_plus)
if params.n_ctx == -1: if params.n_ctx == -1:
if args.ctx is None: if args.ctx is None:
raise Exception( raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
"The model doesn't have a context size, and you didn't specify one with --ctx\n"
"Please specify one with --ctx:\n" "Please specify one with --ctx:\n"
" - LLaMA v1: --ctx 2048\n" " - LLaMA v1: --ctx 2048\n"
" - LLaMA v2: --ctx 4096\n" " - LLaMA v2: --ctx 4096\n")
)
params.n_ctx = args.ctx params.n_ctx = args.ctx
if args.outtype: if args.outtype:
@ -1621,42 +1442,30 @@ def main(argv: Optional[list[str]] = None) -> None:
if not args.outfile: if not args.outfile:
raise ValueError("need --outfile if using --vocab-only") raise ValueError("need --outfile if using --vocab-only")
outfile = args.outfile outfile = args.outfile
OutputFile.write_vocab_only( OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
outfile, endianess=endianess, pad_vocab=args.pad_vocab)
params,
vocab,
special_vocab,
endianess=endianess,
pad_vocab=args.pad_vocab,
)
print(f"Wrote {outfile}") print(f"Wrote {outfile}")
return return
if model_plus.vocab is not None and args.vocab_dir is None: if model_plus.vocab is not None and args.vocab_dir is None:
vocab = model_plus.vocab vocab = model_plus.vocab
print(f"Vocab info: {vocab}")
print(f"Special vocab info: {special_vocab}")
model = model_plus.model model = model_plus.model
model = convert_model_names(model, params) model = convert_model_names(model, params)
ftype = pick_output_type(model, args.outtype) ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype) model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_output_file(model_plus.paths, ftype) outfile = args.outfile or default_outfile(model_plus.paths, ftype)
params.ftype = ftype params.ftype = ftype
print(f"Writing {outfile}, format {ftype}") print(f"Writing {outfile}, format {ftype}")
OutputFile.write_all( OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
outfile, concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
ftype,
params,
model,
vocab,
special_vocab,
concurrency=args.concurrency,
endianess=endianess,
pad_vocab=args.pad_vocab,
)
print(f"Wrote {outfile}") print(f"Wrote {outfile}")
if __name__ == "__main__": if __name__ == '__main__':
main(sys.argv[1:]) # Exclude the first element (script name) from sys.argv main()
View file
@ -23,6 +23,9 @@ else()
add_subdirectory(infill) add_subdirectory(infill)
add_subdirectory(llama-bench) add_subdirectory(llama-bench)
add_subdirectory(llava) add_subdirectory(llava)
if (LLAMA_SYCL)
add_subdirectory(sycl)
endif()
add_subdirectory(main) add_subdirectory(main)
add_subdirectory(tokenize) add_subdirectory(tokenize)
add_subdirectory(parallel) add_subdirectory(parallel)
View file
@ -104,7 +104,7 @@ int main(int argc, char ** argv) {
ctx_params.seed = 1234; ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max; ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = 512; ctx_params.n_batch = 2048;
ctx_params.mul_mat_q = mmq; ctx_params.mul_mat_q = mmq;
ctx_params.n_threads = params.n_threads; ctx_params.n_threads = params.n_threads;
View file
@ -1800,6 +1800,8 @@ int main(int argc, char ** argv) {
std::vector<size_t> train_samples_begin; std::vector<size_t> train_samples_begin;
std::vector<size_t> train_samples_size; std::vector<size_t> train_samples_size;
printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data); printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
tokenize_file(lctx, tokenize_file(lctx,
params.common.fn_train_data, params.common.fn_train_data,
params.common.sample_start, params.common.sample_start,
View file
@ -26,6 +26,7 @@ struct StatParams {
std::string ofile = "imatrix.dat"; std::string ofile = "imatrix.dat";
int n_output_frequency = 10; int n_output_frequency = 10;
int verbosity = 1; int verbosity = 1;
int keep_every = 0;
bool collect_output_weight = false; bool collect_output_weight = false;
}; };
@ -42,6 +43,9 @@ private:
int m_last_call = 0; int m_last_call = 0;
std::vector<float> m_src1_data; std::vector<float> m_src1_data;
std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
//
void save_imatrix(const char * file_name) const;
void keep_imatrix(int ncall) const;
}; };
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@ -117,6 +121,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (m_last_call % m_params.n_output_frequency == 0) { if (m_last_call % m_params.n_output_frequency == 0) {
save_imatrix(); save_imatrix();
} }
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
keep_imatrix(m_last_call);
}
} }
} }
} else { } else {
@ -143,6 +150,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (m_last_call % m_params.n_output_frequency == 0) { if (m_last_call % m_params.n_output_frequency == 0) {
save_imatrix(); save_imatrix();
} }
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
keep_imatrix(m_last_call);
}
} }
} }
@ -150,7 +160,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
} }
void IMatrixCollector::save_imatrix() const { void IMatrixCollector::save_imatrix() const {
const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(); save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
}
void IMatrixCollector::keep_imatrix(int ncall) const {
auto file_name = m_params.ofile;
if (file_name.empty()) file_name = "imatrix.dat";
file_name += ".at_";
file_name += std::to_string(ncall);
save_imatrix(file_name.c_str());
}
void IMatrixCollector::save_imatrix(const char * fname) const {
std::ofstream out(fname, std::ios::binary); std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size(); int n_entries = m_stats.size();
out.write((const char*)&n_entries, sizeof(n_entries)); out.write((const char*)&n_entries, sizeof(n_entries));
@ -248,7 +269,7 @@ static void process_logits(
} }
} }
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
@ -269,10 +290,12 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
} }
std::vector<float> logit_history; std::vector<float> logit_history;
logit_history.resize(tokens.size());
std::vector<float> prob_history; std::vector<float> prob_history;
if (compute_ppl) {
logit_history.resize(tokens.size());
prob_history.resize(tokens.size()); prob_history.resize(tokens.size());
}
const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk_max = tokens.size() / n_ctx;
@ -288,12 +311,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1); std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits;
if (compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
}
for (int i = 0; i < n_chunk; ++i) { for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx; const int start = i * n_ctx;
const int end = start + n_ctx; const int end = start + n_ctx;
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits; std::vector<float> logits;
const auto t_start = std::chrono::high_resolution_clock::now(); const auto t_start = std::chrono::high_resolution_clock::now();
@ -321,9 +349,11 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
// restore the original token in case it was set to BOS // restore the original token in case it was set to BOS
tokens[batch_start] = token_org; tokens[batch_start] = token_org;
if (compute_ppl && num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx); const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
} }
}
const auto t_end = std::chrono::high_resolution_clock::now(); const auto t_end = std::chrono::high_resolution_clock::now();
@ -338,16 +368,22 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
} }
if (compute_ppl) {
const int first = n_ctx/2; const int first = n_ctx/2;
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1; count += n_ctx - first - 1;
printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout); fflush(stdout);
logits.clear();
}
} }
printf("\n"); printf("\n");
if (compute_ppl) {
nll2 /= count; nll2 /= count;
nll /= count; nll /= count;
const double ppl = exp(nll); const double ppl = exp(nll);
@ -358,6 +394,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
} else { } else {
printf("Unexpected negative standard deviation of log(prob)\n"); printf("Unexpected negative standard deviation of log(prob)\n");
} }
}
return true; return true;
} }
@ -365,6 +402,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
StatParams sparams; StatParams sparams;
bool compute_ppl = true;
std::vector<char*> args; std::vector<char*> args;
args.push_back(argv[0]); args.push_back(argv[0]);
int iarg = 1; int iarg = 1;
@ -381,13 +419,22 @@ int main(int argc, char ** argv) {
} }
else if (arg == "--verbosity") { else if (arg == "--verbosity") {
sparams.verbosity = std::stoi(argv[++iarg]); sparams.verbosity = std::stoi(argv[++iarg]);
} else if (arg == "--no-ppl") {
compute_ppl = false;
} else if (arg == "--keep-imatrix") {
sparams.keep_every = std::stoi(argv[++iarg]);
} else { } else {
args.push_back(argv[iarg]); args.push_back(argv[iarg]);
} }
} }
if (iarg < argc) { if (iarg < argc) {
std::string arg{argv[iarg]};
if (arg == "--no-ppl") {
compute_ppl = false;
} else {
args.push_back(argv[iarg]); args.push_back(argv[iarg]);
} }
}
gpt_params params; gpt_params params;
params.n_batch = 512; params.n_batch = 512;
@ -448,7 +495,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", get_system_info(params).c_str());
} }
bool OK = compute_imatrix(ctx, params); bool OK = compute_imatrix(ctx, params, compute_ppl);
if (!OK) { if (!OK) {
return 1; return 1;
} }
View file
@ -241,7 +241,7 @@ int main(int argc, char ** argv) {
LOG("add_bos: %d\n", add_bos); LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape; bool suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1); params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false; suff_rm_leading_spc = false;
} }
View file
@ -562,6 +562,7 @@ struct test {
static const int build_number; static const int build_number;
static const bool cuda; static const bool cuda;
static const bool opencl; static const bool opencl;
static const bool vulkan;
static const bool metal; static const bool metal;
static const bool gpu_blas; static const bool gpu_blas;
static const bool blas; static const bool blas;
@ -643,6 +644,9 @@ struct test {
if (opencl) { if (opencl) {
return "OpenCL"; return "OpenCL";
} }
if (vulkan) {
return "Vulkan";
}
if (metal) { if (metal) {
return "Metal"; return "Metal";
} }
@ -658,7 +662,7 @@ struct test {
static const std::vector<std::string> & get_fields() { static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = { static const std::vector<std::string> fields = {
"build_commit", "build_number", "build_commit", "build_number",
"cuda", "opencl", "metal", "gpu_blas", "blas", "cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas",
"cpu_info", "gpu_info", "cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params", "model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "type_k", "type_v", "n_batch", "n_threads", "type_k", "type_v",
@ -682,7 +686,7 @@ struct test {
field == "avg_ns" || field == "stddev_ns") { field == "avg_ns" || field == "stddev_ns") {
return INT; return INT;
} }
if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" || if (field == "cuda" || field == "opencl" || field == "vulkan"|| field == "metal" || field == "gpu_blas" || field == "blas" ||
field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") { field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
return BOOL; return BOOL;
} }
@ -710,7 +714,7 @@ struct test {
} }
std::vector<std::string> values = { std::vector<std::string> values = {
build_commit, std::to_string(build_number), build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info, cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
@ -738,6 +742,7 @@ const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER; const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cublas(); const bool test::cuda = !!ggml_cpu_has_cublas();
const bool test::opencl = !!ggml_cpu_has_clblast(); const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::metal = !!ggml_cpu_has_metal(); const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas(); const bool test::blas = !!ggml_cpu_has_blas();
View file
@ -30,6 +30,7 @@ android {
} }
externalNativeBuild { externalNativeBuild {
cmake { cmake {
arguments += "-DCMAKE_BUILD_TYPE=Release"
cppFlags += listOf() cppFlags += listOf()
arguments += listOf() arguments += listOf()
} }
View file
@ -6,7 +6,7 @@
" Similarly, you could add an insert mode keybind with " Similarly, you could add an insert mode keybind with
" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR> " inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
" "
" g:llama_api_url and g:llama_overrides can be configured in your .vimrc " g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
" let g:llama_api_url = "192.168.1.10:8080" " let g:llama_api_url = "192.168.1.10:8080"
" llama_overrides can also be set through buffer/window scopes. For instance " llama_overrides can also be set through buffer/window scopes. For instance
" autocmd filetype python let b:llama_overrides = {"temp": 0.2} " autocmd filetype python let b:llama_overrides = {"temp": 0.2}
@ -82,6 +82,9 @@ func llama#doLlamaGen()
endif endif
let l:querydata.prompt = join(l:buflines, "\n") let l:querydata.prompt = join(l:buflines, "\n")
let l:curlcommand = copy(s:curlcommand) let l:curlcommand = copy(s:curlcommand)
if exists("g:llama_api_key")
call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
endif
let l:curlcommand[2] = json_encode(l:querydata) let l:curlcommand[2] = json_encode(l:querydata)
let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])}) let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
endfunction endfunction
View file
@ -0,0 +1,131 @@
# MobileVLM
Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
For more information, please see [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM).
The implementation is based on llava and is compatible with both llava and MobileVLM. The usage is basically the same as for llava.
## Usage
Build with cmake or run `make llava-cli` to build it.
After building, run: `./llava-cli` to see the usage. For example:
```sh
./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
--image path/to/an/image.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
```
## Model conversion
1. Clone `MobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
```sh
git clone https://huggingface.co/mtgv/MobileVLM-1.7B
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
```
2. Use `llava-surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:
```sh
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
```
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert-image-encoder-to-gguf.py \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
--output-dir path/to/MobileVLM-1.7B \
--projector-type ldp
```
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./convert.py path/to/MobileVLM-1.7B
```
5. Use `quantize` to convert the LLaMA part's data type from `fp16` to `q4_k`:
```sh
./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
```
Now both the LLaMA part and the image encoder are in the `MobileVLM-1.7B` directory.
## Android compile and run
### compile
Refer to `examples/llava/android/build_64.sh`:
```sh
mkdir examples/llava/android/build_64
cd examples/llava/android/build_64
../build_64.sh
```
### run on Android
Refer to `android/adb_run.sh` and modify the resource `name` and `path` values as needed.
## Some results on Android with a `Snapdragon 888` chip
### case 1
**input**
```sh
/data/local/tmp/llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
--image /data/local/tmp/demo.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
```
**output**
```sh
encode_image_with_clip: image encoded in 21148.71 ms by CLIP ( 146.87 ms per image patch)
Susan Wise Bauer
llama_print_timings: load time = 23574.72 ms
llama_print_timings: sample time = 1.24 ms / 6 runs ( 0.21 ms per token, 4850.44 tokens per second)
llama_print_timings: prompt eval time = 12460.15 ms / 246 tokens ( 50.65 ms per token, 19.74 tokens per second)
llama_print_timings: eval time = 424.86 ms / 6 runs ( 70.81 ms per token, 14.12 tokens per second)
llama_print_timings: total time = 34731.93 ms
```
### case 2
**input**
```sh
/data/local/tmp/llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
--image /data/local/tmp/cat.jpeg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
```
**output**
```sh
encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
The image depicts a cat sitting in the grass near some tall green plants.
llama_print_timings: load time = 23257.32 ms
llama_print_timings: sample time = 5.25 ms / 18 runs ( 0.29 ms per token, 3430.53 tokens per second)
llama_print_timings: prompt eval time = 11900.73 ms / 232 tokens ( 51.30 ms per token, 19.49 tokens per second)
llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 ms per token, 14.07 tokens per second)
llama_print_timings: total time = 34570.79 ms
```
## Minor shortcomings
The output `n_patch` of the `ldp` projector is 1/4 of the input. As a quick implementation, we uniformly modified the `clip_n_patches` function to return a quarter of the patch count, so when measuring time consumption, the reported per-patch time is about 4 times larger than the real cost.
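For reference, here is a condensed sketch of how that quartering is wired in, mirroring the `clip_n_patches` change in `clip.cpp` from this commit (not a standalone implementation; `clip_ctx` and `PROJECTOR_TYPE_LDP` come from the surrounding file):

```cpp
// Condensed from clip.cpp in this commit: the patch count is derived from the
// image and patch sizes, then divided by 4 when the LDP projector is active so
// that downstream buffers match the projector's reduced output.
int clip_n_patches(const struct clip_ctx * ctx) {
    auto & params = ctx->vision_model.hparams;

    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);

    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
        n_patches /= 4; // LDP emits 1/4 of the input patches
    }

    return n_patches;
}
```

Because the timing log divides by this quartered patch count, the reported `ms per image patch` is inflated by the same factor of 4.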
## TODO
- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
- [ ] Optimize LDP projector performance
- Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
- Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
- [ ] run MobileVLM on `Jetson Orin`
- [ ] Support more model variants, such as `MobileVLM-3B`.
## Contributors
```sh
zhangjidong05, yangyang260, huyiming03, chenxiaotao03
```
View file
@ -0,0 +1,53 @@
#!/bin/bash
model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
projector_name="mmproj-model-f16.gguf"
llama_name="ggml-model-q4_k.gguf"
img_dir="/Users/cxt/model/llm"
img_name="demo.jpg"
prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
# img_name="cat.jpeg"
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
program_dir="build_64/bin"
binName="llava-cli"
n_threads=4
deviceDir="/data/local/tmp"
saveDir="output"
if [ ! -d ${saveDir} ]; then
mkdir ${saveDir}
fi
function android_run() {
# # copy resource into device
# adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
# adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
# copy program into device
adb push ${program_dir}/${binName} ${deviceDir}/${binName}
adb shell "chmod 0777 ${deviceDir}/${binName}"
# run
adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt"
adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
>> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1"
adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir}
}
android_run
echo "android_run is Done!"
View file
@ -0,0 +1,8 @@
#!/bin/bash
cmake ../../../../ \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_ABI="arm64-v8a" \
-DANDROID_PLATFORM=android-23 $1
make -j4
View file
@ -2,17 +2,6 @@
// so there might be still unnecessary artifacts hanging around // so there might be still unnecessary artifacts hanging around
// I'll gradually clean and extend it // I'll gradually clean and extend it
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <stdexcept>
#include <vector>
#include "clip.h" #include "clip.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
@ -29,6 +18,19 @@
#define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h" #include "stb_image.h"
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <stdexcept>
#include <vector>
#include <sstream>
#include <cinttypes>
static std::string format(const char * fmt, ...) { static std::string format(const char * fmt, ...) {
va_list ap; va_list ap;
va_list ap2; va_list ap2;
@ -67,6 +69,7 @@ static std::string format(const char * fmt, ...) {
#define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std" #define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
// //
// tensor name constants // tensor name constants
@ -89,6 +92,22 @@ static std::string format(const char * fmt, ...) {
#define TN_TEXT_PROJ "text_projection.weight" #define TN_TEXT_PROJ "text_projection.weight"
#define TN_VIS_PROJ "visual_projection.weight" #define TN_VIS_PROJ "visual_projection.weight"
#define TN_LLAVA_PROJ "mm.%d.%s" #define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_UNKNOWN,
};
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" },
};
// //
// utilities to get data from a gguf file // utilities to get data from a gguf file
@ -129,6 +148,91 @@ static std::string get_ftype(int ftype) {
return ggml_type_name(static_cast<ggml_type>(ftype)); return ggml_type_name(static_cast<ggml_type>(ftype));
} }
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
switch (type) {
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
default: return format("unknown type %d", type);
}
}
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result;
for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos);
if (new_pos == std::string::npos) {
result += s.substr(pos, s.size() - pos);
break;
}
result += s.substr(pos, new_pos - pos) + replace;
pos = new_pos;
}
s = std::move(result);
}
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
switch (type) {
case GGUF_TYPE_STRING:
return gguf_get_val_str(ctx_gguf, i);
case GGUF_TYPE_ARRAY:
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {
if (arr_type == GGUF_TYPE_STRING) {
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
// escape quotes
replace_all(val, "\\", "\\\\");
replace_all(val, "\"", "\\\"");
ss << '"' << val << '"';
} else if (arr_type == GGUF_TYPE_ARRAY) {
ss << "???";
} else {
ss << gguf_data_to_str(arr_type, data, j);
}
if (j < arr_n - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
default:
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
}
}
static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
size_t tensor_size = ggml_nbytes(tensor);
printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
}
static projector_type clip_projector_type_from_string(const std::string & name) {
for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
if (kv.second == name) {
return kv.first;
}
}
return PROJECTOR_TYPE_UNKNOWN;
}
// //
// image data // image data
// //
@ -201,10 +305,44 @@ struct clip_vision_model {
struct ggml_tensor * projection; struct ggml_tensor * projection;
// LLaVA projection // LLaVA projection
struct ggml_tensor * mm_0_w; struct ggml_tensor * mm_0_w = NULL;
struct ggml_tensor * mm_0_b; struct ggml_tensor * mm_0_b = NULL;
struct ggml_tensor * mm_2_w; struct ggml_tensor * mm_2_w = NULL;
struct ggml_tensor * mm_2_b; struct ggml_tensor * mm_2_b = NULL;
// Yi type models with mlp+normalization projection
struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
struct ggml_tensor * mm_1_b = NULL;
struct ggml_tensor * mm_3_w = NULL;
struct ggml_tensor * mm_3_b = NULL;
struct ggml_tensor * mm_4_w = NULL;
struct ggml_tensor * mm_4_b = NULL;
// MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w;
struct ggml_tensor * mm_model_mlp_1_b;
struct ggml_tensor * mm_model_mlp_3_w;
struct ggml_tensor * mm_model_mlp_3_b;
struct ggml_tensor * mm_model_block_1_block_0_0_w;
struct ggml_tensor * mm_model_block_1_block_0_1_w;
struct ggml_tensor * mm_model_block_1_block_0_1_b;
struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
struct ggml_tensor * mm_model_block_1_block_2_0_w;
struct ggml_tensor * mm_model_block_1_block_2_1_w;
struct ggml_tensor * mm_model_block_1_block_2_1_b;
struct ggml_tensor * mm_model_block_2_block_0_0_w;
struct ggml_tensor * mm_model_block_2_block_0_1_w;
struct ggml_tensor * mm_model_block_2_block_0_1_b;
struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
struct ggml_tensor * mm_model_block_2_block_2_0_w;
struct ggml_tensor * mm_model_block_2_block_2_1_w;
struct ggml_tensor * mm_model_block_2_block_2_1_b;
}; };
struct clip_ctx { struct clip_ctx {
@ -213,6 +351,7 @@ struct clip_ctx {
bool has_llava_projector = false; bool has_llava_projector = false;
struct clip_vision_model vision_model; struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP;
float image_mean[3]; float image_mean[3];
float image_std[3]; float image_std[3];
@ -330,6 +469,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// pre-layernorm // pre-layernorm
{ {
embeddings = ggml_norm(ctx0, embeddings, eps); embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "pre_ln");
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
} }
@ -430,9 +570,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
free(patches_data); free(patches_data);
} }
// shape [1, 576, 1024]
// ne is whcn, ne = [1024, 576, 1, 1]
embeddings = ggml_get_rows(ctx0, embeddings, patches); embeddings = ggml_get_rows(ctx0, embeddings, patches);
// mm projection 0 // print_tensor_info(embeddings, "embeddings");
// llava projector
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
@ -440,6 +585,141 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
// First LayerNorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
model.mm_1_b);
// GELU activation
embeddings = ggml_gelu(ctx0, embeddings);
// Second linear layer
embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
// Second LayerNorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
model.mm_4_b);
}
else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projector
int n_patch = 24;
struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
mlp_1 = ggml_gelu(ctx0, mlp_1);
struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
// block 1
struct ggml_tensor * block_1 = nullptr;
{
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
// stride = 1, padding = 1, bias is nullptr
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
// layer norm
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// hardswish
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// residual
block_1 = ggml_add(ctx0, mlp_3, block_1);
}
// block_2
{
// stride = 2
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// layer norm
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// hardswish
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
// not sure the parameters is right for globalAvgPooling
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
}
embeddings = block_1;
}
else {
GGML_ASSERT(false);
}
} }
// build the graph // build the graph
@ -485,16 +765,47 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
printf("\n"); printf("\n");
} }
const int n_tensors = gguf_get_n_tensors(ctx); const int n_tensors = gguf_get_n_tensors(ctx);
// kv // kv
if (verbosity >= 3) {
const int n_kv = gguf_get_n_kv(ctx); const int n_kv = gguf_get_n_kv(ctx);
printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
__func__, n_kv, n_tensors, fname);
{
std::map<enum ggml_type, uint32_t> n_type;
for (int i = 0; i < n_kv; ++i) { for (int i = 0; i < n_tensors; i++) {
const char * key = gguf_get_key(ctx, i); enum ggml_type type = gguf_get_tensor_type(ctx, i);
printf("%s: kv[%d]: key = %s\n", __func__, i, key); n_type[type]++;
}
printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx, i);
const enum gguf_type type = gguf_get_kv_type(ctx, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
: gguf_type_name(type);
std::string value = gguf_kv_to_str(ctx, i);
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
}
replace_all(value, "\n", "\\n");
printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
}
// print type counts
for (auto & kv : n_type) {
if (kv.second == 0) {
continue;
}
printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
} }
printf("\n");
} }
// data // data
@ -503,12 +814,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i); const char * name = gguf_get_tensor_name(ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i);
enum ggml_type type = gguf_get_tensor_type(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(meta, name); struct ggml_tensor * cur = ggml_get_tensor(meta, name);
size_t tensor_size = ggml_nbytes(cur); size_t tensor_size = ggml_nbytes(cur);
buffer_size += tensor_size; buffer_size += tensor_size;
if (verbosity >= 3) { if (verbosity >= 3) {
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i, printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
ggml_n_dims(cur), cur->name, tensor_size, offset); __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
} }
} }
} }
@ -517,6 +829,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
clip_ctx * new_clip = new clip_ctx; clip_ctx * new_clip = new clip_ctx;
// update projector type
{
int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
if (idx != -1) {
const std::string proj_type = gguf_get_val_str(ctx, idx);
new_clip->proj_type = clip_projector_type_from_string(proj_type);
}
else {
new_clip->proj_type = PROJECTOR_TYPE_MLP;
}
if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
}
}
}
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
new_clip->backend = ggml_backend_cuda_init(0); new_clip->backend = ggml_backend_cuda_init(0);
printf("%s: CLIP using CUDA backend\n", __func__); printf("%s: CLIP using CUDA backend\n", __func__);
@ -661,10 +990,63 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
// LLaVA projection
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
try {
// Yi-type llava
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
} catch (std::runtime_error & e) { }
try {
// missing in Yi-type llava
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
} catch (std::runtime_error & e) { }
try {
// Yi-type llava
vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
} catch (std::runtime_error & e) { }
try {
// Yi-type llava
vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
} catch (std::runtime_error & e) { }
}
else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
vision_model.layers.resize(hparams.n_layer); vision_model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) { for (int il = 0; il < hparams.n_layer; ++il) {
@ -949,7 +1331,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
".*weight", ".*weight",
}; };
std::vector<uint8_t> read_data(512);
std::vector<uint8_t> work(512); std::vector<uint8_t> work(512);
std::vector<float> conv_buf(512); std::vector<float> conv_buf(512);
std::vector<int64_t> hist_all(1 << 4, 0); std::vector<int64_t> hist_all(1 << 4, 0);
@ -1100,13 +1481,27 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
} }
int clip_n_mmproj_embd(const struct clip_ctx * ctx) { int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
}
else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
return ctx->vision_model.mm_2_b->ne[0]; return ctx->vision_model.mm_2_b->ne[0];
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
return ctx->vision_model.mm_3_b->ne[0];
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
} }
int clip_n_patches(const struct clip_ctx * ctx) { int clip_n_patches(const struct clip_ctx * ctx) {
auto & params = ctx->vision_model.hparams; auto & params = ctx->vision_model.hparams;
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
return (params.image_size / params.patch_size) * (params.image_size / params.patch_size); if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
n_patches /= 4;
}
return n_patches;
} }
size_t clip_embd_nbytes(const struct clip_ctx * ctx) { size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
View file
@ -81,6 +81,7 @@ ap.add_argument("--vision-only", action="store_true", required=False,
ap.add_argument("--clip_model_is_vision", action="store_true", required=False, ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@ -174,6 +175,8 @@ elif args.vision_only and not has_llava_projector:
fout.add_description("vision-only CLIP model") fout.add_description("vision-only CLIP model")
elif has_llava_projector: elif has_llava_projector:
fout.add_description("image encoder for LLaVA") fout.add_description("image encoder for LLaVA")
# add projector type
fout.add_string("clip.projector_type", args.projector_type)
else: else:
fout.add_description("two-tower CLIP model") fout.add_description("two-tower CLIP model")
@ -218,7 +221,8 @@ if has_llava_projector:
projector = torch.load(args.llava_projector) projector = torch.load(args.llava_projector)
for name, data in projector.items(): for name, data in projector.items():
name = get_tensor_name(name) name = get_tensor_name(name)
if data.ndim == 2: # pw and dw conv ndim==4
if data.ndim == 2 or data.ndim == 4:
data = data.squeeze().numpy().astype(np.float16) data = data.squeeze().numpy().astype(np.float16)
else: else:
data = data.squeeze().numpy().astype(np.float32) data = data.squeeze().numpy().astype(np.float32)
View file
@ -148,10 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
// llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:" std::string system_prompt, user_prompt;
eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos); size_t image_pos = prompt.find("<image>");
if (image_pos != std::string::npos) {
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
// We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
size_t pos = 0;
while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
user_prompt.replace(pos, 2, "\n");
pos += 1; // Advance past the replaced newline
}
pos = 0; // reset the search position before scanning the system prompt
while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
system_prompt.replace(pos, 2, "\n");
pos += 1; // Advance past the replaced newline
}
printf("system_prompt: %s\n", system_prompt.c_str());
printf("user_prompt: %s\n", user_prompt.c_str());
} else {
// llava-1.5 native mode
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
user_prompt = prompt + "\nASSISTANT:";
}
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past); llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false); eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
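As an illustration of the new <image> templating mode (the wording of the prompt is invented; only the placeholder marker and the split behaviour come from the code above), a minimal Python rendering of the same split:

# Invented prompt text; the split mirrors what process_prompt now does in C++.
prompt = "A chat between a human and an assistant.\nUSER:<image>\nWhat is shown in the picture?\nASSISTANT:"
image_pos = prompt.find("<image>")
system_prompt = prompt[:image_pos]                  # evaluated first, then the image embedding
user_prompt = prompt[image_pos + len("<image>"):]   # evaluated after the image embedding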
// generate the response // generate the response
@ -162,6 +187,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
for (int i = 0; i < max_tgt_len; i++) { for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
if (strcmp(tmp, "</s>") == 0) break; if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp); printf("%s", tmp);
fflush(stdout); fflush(stdout);
View file
@ -112,6 +112,43 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
} }
static inline int nearest_int(float fval) {
//assert(fval <= 4194303.f);
float val = fval + 12582912.f;
int i; memcpy(&i, &val, sizeof(int));
return (i & 0x007fffff) - 0x00400000;
}
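The nearest_int helper above is the classic bit trick: adding 1.5 * 2^23 (12582912.0f) pushes the value into the low mantissa bits of a float32, so the FPU's round-to-nearest performs the rounding and masking the mantissa recovers the integer. A small Python port, shown only to make the bit manipulation explicit:

import struct

def nearest_int(fval: float) -> int:
    # Reinterpret the float32 bits after adding 1.5 * 2**23, exactly as the C++ memcpy does.
    (i,) = struct.unpack('<i', struct.pack('<f', fval + 12582912.0))
    return (i & 0x007fffff) - 0x00400000

assert nearest_int(2.6) == 3 and nearest_int(-2.6) == -3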
static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) {
float max_logit = logits[0];
float min_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) {
max_logit = std::max(max_logit, logits[i]);
min_logit = std::min(min_logit, logits[i]);
}
min_logit = std::max(min_logit, max_logit - 16);
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
}
const float log_sum_exp = log(sum_exp);
const float min_log_prob = min_logit - max_logit - log_sum_exp;
const float scale = (max_logit - min_logit)/65535.f;
float * d = (float *)log_prob;
d[0] = scale;
d[1] = min_log_prob;
log_prob += 4;
if (scale) {
const float inv_scale = 1/scale;
for (int i = 0; i < n_vocab; ++i) {
log_prob[i] = logits[i] > min_logit ? nearest_int(inv_scale*(logits[i] - min_logit)) : 0;
}
} else {
std::memset(log_prob, 0, n_vocab*sizeof(uint16_t));
}
return max_logit + log_sum_exp - logits[tok];
}
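This log_softmax overload packs one token position into nv = 2*((n_vocab + 1)/2) + 4 uint16 slots: the first four slots hold two float32 values (scale and min_log_prob) and the remaining slots hold the logits quantized relative to min_logit. A minimal Python sketch of the inverse transform, assuming a little-endian host (the C++ side writes raw native-endian memory) and using numpy purely for illustration:

import numpy as np

def decode_log_probs(record: bytes, n_vocab: int) -> np.ndarray:
    # record is one nv-slot entry: 4 uint16 slots of header (two float32) + quantized values.
    scale, min_log_prob = np.frombuffer(record[:8], dtype='<f4')
    q = np.frombuffer(record[8:], dtype='<u2')[:n_vocab].astype(np.float32)
    return scale * q + min_log_prob   # ~ log p(token i), as consumed by the KL-divergence pass below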
static void process_logits( static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers, int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history double & nll, double & nll2, float * logit_history, float * prob_history
@ -147,6 +184,130 @@ static void process_logits(
} }
} }
static void process_logits(std::ostream& out, int n_vocab, const float * logits, const int * tokens, int n_token,
std::vector<std::thread> & workers, std::vector<uint16_t> & log_probs, double & nll, double & nll2) {
std::mutex mutex;
const int nv = 2*((n_vocab + 1)/2) + 4;
int counter = 0;
auto compute = [&mutex, &counter, &log_probs, &nll, &nll2, n_vocab, logits, tokens, n_token, nv] () {
double local_nll = 0;
double local_nll2 = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
nll += local_nll; nll2 += local_nll2;
break;
}
lock.unlock();
const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
local_nll += v;
local_nll2 += v*v;
}
};
for (auto & w : workers) {
w = std::thread(compute);
}
compute();
for (auto & w : workers) {
w.join();
}
out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t));
}
struct kl_divergence_result {
double sum_nll = 0;
double sum_nll2 = 0;
double sum_kld = 0;
double sum_kld2 = 0;
double sum_nll_diff = 0;
double sum_nll_diff2 = 0;
size_t n_same_top = 0;
size_t count = 0;
};
static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
float max_logit = logits[0];
int imax = 0;
for (int i = 1; i < n_vocab; ++i) {
if (logits[i] > max_logit) {
max_logit = logits[i];
imax = i;
}
}
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
}
const float log_sum_exp = log(sum_exp);
const float * d = (const float *)base_log_prob;
const float scale = d[0];
const float min_log_prob = d[1];
base_log_prob += 4;
float nll = max_logit + log_sum_exp - logits[tok];
kld.sum_nll += nll;
kld.sum_nll2 += nll*nll;
nll += (scale*base_log_prob[tok] + min_log_prob);
kld.sum_nll_diff += nll;
kld.sum_nll_diff2 += nll*nll;
max_logit += log_sum_exp;
double sum = 0;
int imax_base = -1;
float p_log_base_max = 0;
for (int i = 0; i < n_vocab; ++i) {
const float p_log_base = scale*base_log_prob[i] + min_log_prob;
if (i == 0 || p_log_base > p_log_base_max) {
p_log_base_max = p_log_base;
imax_base = i;
}
if (p_log_base > -16.f) {
const float p_base = expf(p_log_base);
sum += p_base * (p_log_base - logits[i] + max_logit);
}
}
kld.sum_kld += sum;
kld.sum_kld2 += sum*sum;
++kld.count;
if (imax == imax_base) ++kld.n_same_top;
return sum;
}
static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
float * kld_values) {
std::mutex mutex;
const int nv = 2*((n_vocab + 1)/2) + 4;
int counter = 0;
auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () {
kl_divergence_result local_kld;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
kld.sum_nll += local_kld.sum_nll;
kld.sum_nll2 += local_kld.sum_nll2;
kld.sum_kld += local_kld.sum_kld;
kld.sum_kld2 += local_kld.sum_kld2;
kld.sum_nll_diff += local_kld.sum_nll_diff;
kld.sum_nll_diff2 += local_kld.sum_nll_diff2;
kld.n_same_top += local_kld.n_same_top;
kld.count += local_kld.count;
break;
}
lock.unlock();
double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
kld_values[i] = (float)v;
}
};
for (auto & w : workers) {
w = std::thread(compute);
}
compute();
for (auto & w : workers) {
w.join();
}
}
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
@ -294,6 +455,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
logits_stream.open(params.logits_file.c_str());
if (!logits_stream.is_open()) {
fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
return {};
}
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
logits_stream.write("_logits_", 8);
logits_stream.write((const char *)&n_ctx, sizeof(n_ctx));
}
auto tim1 = std::chrono::high_resolution_clock::now(); auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__); fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@ -336,6 +509,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1); std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
std::vector<uint16_t> log_probs;
if (!params.logits_file.empty()) {
logits_stream.write((const char *)&n_vocab, sizeof(n_vocab));
logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
const int nv = 2*((n_vocab + 1)/2) + 4;
log_probs.resize(n_ctx * nv);
}
for (int i = 0; i < n_chunk; ++i) { for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx; const int start = i * n_ctx;
const int end = start + n_ctx; const int end = start + n_ctx;
@ -398,8 +580,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// process the entire prompt. // process the entire prompt.
const int first = n_ctx/2; const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
if (!params.logits_file.empty()) {
process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs, nll, nll2);
} else {
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
}
count += n_ctx - first - 1; count += n_ctx - first - 1;
// perplexity is e^(average negative log-likelihood) // perplexity is e^(average negative log-likelihood)
@ -458,23 +645,24 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
return true; return true;
} }
#define K_TOKEN_CHUNK 4
static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers, static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) { const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
constexpr int k_token_chunk = 4;
if (eval_results.size() != eval_pairs.size()) { if (eval_results.size() != eval_pairs.size()) {
eval_results.resize(eval_pairs.size()); eval_results.resize(eval_pairs.size());
} }
if (eval_pairs.empty()) return; if (eval_pairs.empty()) return;
size_t max_threads = std::min((eval_pairs.size() + k_token_chunk - 1)/k_token_chunk, workers.size()); size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());
std::atomic<int> counter(0); std::atomic<int> counter(0);
auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () { auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
float local_logprobs[k_token_chunk]; float local_logprobs[K_TOKEN_CHUNK];
while (true) { while (true) {
size_t first = counter.fetch_add(k_token_chunk, std::memory_order_relaxed); size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
if (first >= eval_results.size()) break; if (first >= eval_results.size()) break;
size_t last = std::min(first + k_token_chunk, eval_results.size()); size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
for (size_t i = first; i < last; ++i) { for (size_t i = first; i < last; ++i) {
auto logits = batch_logits + eval_pairs[i].first * n_vocab; auto logits = batch_logits + eval_pairs[i].first * n_vocab;
float max_logit = logits[0]; float max_logit = logits[0];
@ -497,7 +685,6 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
for (size_t it = 0; it < max_threads; ++it) { for (size_t it = 0; it < max_threads; ++it) {
workers[it].join(); workers[it].join();
} }
} }
static void hellaswag_score(llama_context * ctx, const gpt_params & params) { static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
@ -540,14 +727,14 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// This is needed as usual for LLaMA models // This is needed as usual for LLaMA models
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
// Number of tasks to use when computing the score // Number of tasks to use when computing the score
if (params.hellaswag_tasks < hs_task_count) { if (params.hellaswag_tasks < hs_task_count) {
hs_task_count = params.hellaswag_tasks; hs_task_count = params.hellaswag_tasks;
} }
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
// The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
std::mt19937 rng(1); std::mt19937 rng(1);
@ -1031,6 +1218,566 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
} }
static bool deserialize_string(std::istream & in, std::string & str) {
uint32_t size;
if (!in.read((char *)&size, sizeof(size)).fail()) {
str.resize(size);
if (!in.read((char *)&str[0], size).fail()) return true;
}
return false;
}
struct multiple_choice_answers {
std::vector<std::string> answers;
std::vector<int> labels;
bool deserialize(std::istream& in) {
uint32_t n;
in.read((char *)&n, sizeof(n));
if (in.fail() || n > 100) return false; // 100 as max. number of answers should be good enough for any practical purpose
answers.resize(n);
labels.resize(n);
for (auto& a : answers) {
if (!deserialize_string(in, a)) return false;
}
in.read((char *)labels.data(), n*sizeof(int));
return !in.fail();
}
};
struct multiple_choice_task {
std::string question; // the question (or context that needs to be continued)
multiple_choice_answers mc1; // possible answers (continuations) with a single correct answer
multiple_choice_answers mc2; // possible answers (continuations) with multiple correct answers - not handled yet
bool deserialize(std::istream& in) {
if (!deserialize_string(in, question)) return false;
return mc1.deserialize(in) && mc2.deserialize(in);
}
// For evaluation
size_t i_batch; // starting index in the llama_batch
size_t common_prefix; // max number of initial tokens that are the same in all sentences
size_t required_tokens; // needed number of tokens to evaluate all answers
std::vector<std::vector<llama_token>> seq_tokens;
std::vector<float> log_probs;
};
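The deserializers above fix a simple binary layout for the prepared multiple-choice data: each string is a uint32 length followed by its bytes, each answer set is a uint32 count followed by that many strings and int32 labels, and each task is question + mc1 + mc2; the full file additionally starts with a uint32 task count and a table of uint32 stream offsets (see multiple_choice_score below). A small Python sketch that packs one task in this layout; little-endian byte order and 32-bit ints are assumptions matching how the C++ reads raw memory on a typical x86 host:

import struct

def pack_string(s: str) -> bytes:
    data = s.encode('utf-8')
    return struct.pack('<I', len(data)) + data

def pack_answers(answers: list[str], labels: list[int]) -> bytes:
    out = struct.pack('<I', len(answers))
    out += b''.join(pack_string(a) for a in answers)
    return out + struct.pack(f'<{len(labels)}i', *labels)

def pack_task(question: str, answers: list[str], labels: list[int]) -> bytes:
    # mc1 carries the single-correct-answer set; mc2 is written as an empty set here.
    return pack_string(question) + pack_answers(answers, labels) + pack_answers([], [])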
static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
if (task.question.empty() || task.mc1.answers.empty()) {
if (log_error) {
printf("%s: found bad task with empty question and/or answers\n", __func__);
}
return false;
}
task.seq_tokens.reserve(task.mc1.answers.size());
for (auto& answer : task.mc1.answers) {
if (answer.empty()) {
if (log_error) {
printf("%s: found empty answer\n", __func__);
}
return false;
}
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
}
auto min_len = task.seq_tokens.front().size();
for (auto& seq : task.seq_tokens) {
min_len = std::min(min_len, seq.size());
}
task.common_prefix = 0;
for (size_t k = 0; k < min_len; ++k) {
auto token = task.seq_tokens[0][k];
bool all_same = true;
for (size_t i = 1; i < task.seq_tokens.size(); ++i) {
if (task.seq_tokens[i][k] != token) {
all_same = false;
break;
}
}
if (!all_same) {
break;
}
++task.common_prefix;
}
task.required_tokens = task.common_prefix;
for (auto& seq : task.seq_tokens) {
task.required_tokens += seq.size() - task.common_prefix;
}
return true;
}
//
// Calculates score for multiple choice tasks with single correct answer from prompt.
// Commonly used LLM evaluation metrics of this type are
// * ARC
// * HellaSwag
// * MMLU
// * TruthfulQA
//
// Validation datasets for these 4 tests can be found at
// https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp
// The data for these datasets was extracted from
// git@hf.co:datasets/allenai/ai2_arc
// https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
// git@hf.co:datasets/Stevross/mmlu
// https://huggingface.co/datasets/truthful_qa
//
static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
std::istringstream strstream(params.prompt);
uint32_t n_task;
strstream.read((char *)&n_task, sizeof(n_task));
if (strstream.fail() || n_task == 0) {
printf("%s: no tasks\n", __func__);
return;
}
printf("%s: there are %u tasks in prompt\n", __func__, n_task);
std::vector<uint32_t> task_pos(n_task);
strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
if (strstream.fail()) {
printf("%s: failed to raad task positions from prompt\n", __func__);
return;
}
std::vector<multiple_choice_task> tasks;
if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
// Use all tasks
tasks.resize(n_task);
printf("%s: reading tasks", __func__);
int n_dot = n_task > 100 ? n_task/100 : 1; // at least 1 to avoid a modulo by zero below
int i = 0;
for (auto& task : tasks) {
++i;
if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
return;
}
if (i%n_dot == 0) printf(".");
}
printf("done\n");
}
else {
printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
std::mt19937 rng(1);
std::vector<int> aux(n_task);
for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
float scale = 1.f/(1.f + (float)std::mt19937::max());
tasks.resize(params.multiple_choice_tasks);
for (auto& task : tasks) {
int j = (int)(scale * rng() * aux.size());
int idx = aux[j];
aux[j] = aux.back();
aux.pop_back();
strstream.seekg(task_pos[idx], std::ios::beg);
if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
return;
}
}
n_task = params.multiple_choice_tasks;
}
// This is needed as usual for LLaMA models
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
printf("%s: preparing task data", __func__);
fflush(stdout);
if (n_task > 500) {
printf("...");
fflush(stdout);
std::atomic<int> counter(0);
std::atomic<int> n_bad(0);
auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
int num_tasks = tasks.size();
int n_bad_local = 0;
while (true) {
int first = counter.fetch_add(K_TOKEN_CHUNK);
if (first >= num_tasks) {
if (n_bad_local > 0) n_bad += n_bad_local;
break;
}
int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
for (int i = first; i < last; ++i) {
if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
}
}
};
size_t max_thread = std::thread::hardware_concurrency();
max_thread = std::min(max_thread, (tasks.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK);
std::vector<std::thread> workers(max_thread-1);
for (auto& w : workers) w = std::thread(prepare);
prepare();
for (auto& w : workers) w.join();
printf("done\n");
fflush(stdout);
int nbad = n_bad;
if (nbad > 0) {
printf("%s: found %d malformed tasks\n", __func__, nbad);
return;
}
} else {
int n_dot = n_task > 100 ? n_task/100 : 1; // at least 1 to avoid a modulo by zero below
int i_task = 0;
for (auto& task : tasks) {
++i_task;
if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
return;
}
if (i_task%n_dot == 0) {
printf(".");
fflush(stdout);
}
}
printf("done\n");
}
printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
printf("\ntask\tacc_norm\n");
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
const int max_seq = 4*max_tasks_per_batch;
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
std::vector<float> tok_logits(n_vocab);
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results;
std::vector<std::thread> workers(std::thread::hardware_concurrency());
std::vector<int> batch_indeces;
int n_done = 0;
int n_correct = 0;
int n_tot_answers = 0;
for (size_t i0 = 0; i0 < tasks.size(); i0++) {
int n_cur = 0;
size_t i1 = i0;
size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
llama_batch_clear(batch);
// batch as many tasks as possible into the available context
// each task gets one unique sequence id per answer
// the common prefix is shared among a task's sequences to save tokens
// we extract logits only from the last common token and from all ending tokens of each sequence
int s0 = 0;
while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
auto& cur_task = tasks[i1];
int num_answers = cur_task.seq_tokens.size();
if (s0 + num_answers > max_seq) {
break;
}
if (int(batch_indeces.size()) != num_answers) {
batch_indeces.resize(num_answers);
}
for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
for (size_t i = 0; i < cur_task.common_prefix; ++i) {
//llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
}
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) {
llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
}
}
s0 += num_answers;
cur_task.i_batch = i_batch;
i_batch += cur_task.required_tokens;
n_cur += cur_task.required_tokens;
if (++i1 == tasks.size()) {
break;
}
}
if (i0 == i1) {
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
llama_kv_cache_clear(ctx);
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
return;
}
// Compute log-probs in parallel
// First we collect all tasks
eval_pairs.clear();
for (size_t i = i0; i < i1; ++i) {
auto& cur_task = tasks[i];
size_t li = cur_task.common_prefix;
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]));
}
++li;
}
}
// Then we do the actual calculation
compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
size_t ir = 0;
// compute the logprobs for each ending of the decoded tasks
for (size_t i = i0; i < i1; ++i) {
auto & cur_task = tasks[i];
//printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
//for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
// if (cur_task.mc1.labels[j] == 1) {
// printf("%d", j+1);
// }
//}
//printf("\n common_prefix: %zu\n", cur_task.common_prefix);
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float));
const auto first_probs = softmax(tok_logits);
cur_task.log_probs.resize(cur_task.seq_tokens.size());
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
size_t count = 1;
float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
//printf(" %zu %g\n", ir, eval_results[ir]);
++count;
log_prob += eval_results[ir++];
}
cur_task.log_probs[s] = log_prob / count;
//printf(" Final: %g\n", log_prob / count);
//printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
}
// Find the ending with maximum logprob
size_t logprob_max_idx = 0;
float logprob_max_val = cur_task.log_probs[0];
for (size_t s = 1; s < cur_task.log_probs.size(); s++) {
if (cur_task.log_probs[s] > logprob_max_val) {
logprob_max_val = cur_task.log_probs[s];
logprob_max_idx = s;
}
}
n_tot_answers += cur_task.log_probs.size();
if (cur_task.mc1.labels[logprob_max_idx] == 1) {
++n_correct;
}
++n_done;
// Print the accumulated accuracy mean x 100
printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
fflush(stdout);
}
i0 = i1 - 1;
}
llama_batch_free(batch);
if (n_done < 100) return;
float p = 1.f*n_correct/n_done;
float sigma = sqrt(p*(1-p)/(n_done-1));
printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
p = 1.f*n_done/n_tot_answers;
sigma = sqrt(p*(1-p)/(n_done-1));
printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
printf("\n");
}
static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (params.logits_file.empty()) {
fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
return;
}
std::ifstream in(params.logits_file.c_str(), std::ios::binary);
if (!in) {
fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
return;
}
{
char check[9]; check[8] = 0;
in.read(check, 8);
if (in.fail() || strncmp("_logits_", check, 8) != 0) {
fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
return;
}
}
uint32_t n_ctx;
in.read((char *)&n_ctx, sizeof(n_ctx));
if (n_ctx > llama_n_ctx(ctx)) {
fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n",
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
}
int n_vocab, n_chunk;
in.read((char *)&n_vocab, sizeof(n_vocab));
in.read((char *)&n_chunk, sizeof(n_chunk));
if (in.fail()) {
fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
return;
}
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
}
std::vector<llama_token> tokens(n_ctx * n_chunk);
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
return;
}
const int n_batch = params.n_batch;
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
std::vector<float> logits;
if (num_batches > 1) {
logits.reserve(n_ctx * n_vocab);
}
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) {
if (count < 1) {
return std::make_pair(0., 0.);
}
double f = sum/count;
double df = sum2/count - f*f;
df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
return std::make_pair(f, df);
};
kl_divergence_result kld;
auto kld_ptr = kld_values.data();
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const auto t_start = std::chrono::high_resolution_clock::now();
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
return;
}
// clear the KV cache
llama_kv_cache_clear(ctx);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
if (num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
}
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL-Divergence Same top\n");
}
const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs_uint16, kld, kld_ptr);
kld_ptr += n_ctx - 1 - first;
auto ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
auto p_top = 1.*kld.n_same_top/kld.count;
auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1));
printf("%4d %10.4lf %10.5lf ± %10.5f %10.5f ± %10.5lf %.5f ± %.5f\n", i+1, exp(ppl.first),
log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second,
p_top, d_p_top);
fflush(stdout);
logits.clear();
}
printf("\n");
if (kld.count < 100) return; // we do not wish to do statistics on so few values
std::sort(kld_values.begin(), kld_values.end());
printf("===== KL-divergence statistics\n");
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second);
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
: kld_values[kld_values.size()/2];
printf("Median : %10.6f\n", kld_median);
auto percentile = [&kld_values] (float fraction) {
if (fraction <= 0) return kld_values.front();
if (fraction >= 1) return kld_values.back();
float p = fraction*(kld_values.size() - 1);
size_t ip = size_t(p); p -= ip;
return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)];
};
printf("Maximum: %10.6f\n", kld_values.back());
printf("KLD_99 : %10.6f\n", percentile(0.99f));
printf("KLD_95 : %10.6f\n", percentile(0.95f));
printf("KLD_90 : %10.6f\n", percentile(0.90f));
printf("Minimum: %10.6f\n", kld_values.front());
printf("KLD_01 : %10.6f\n", percentile(0.01f));
printf("KLD_05 : %10.6f\n", percentile(0.05f));
printf("KLD_10 : %10.6f\n", percentile(0.10f));
}
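Tying the reader above back to the writer added in perplexity(), the logits file layout is: an 8-byte "_logits_" magic, int32 n_ctx, int32 n_vocab, int32 n_chunk, the n_chunk*n_ctx evaluation tokens as int32, then for every chunk (n_ctx - 1 - n_ctx/2) quantized log-prob records of nv uint16 each. A minimal Python reader sketch, assuming a little-endian host and 32-bit llama_token (both match the raw writes above); for illustration only:

import numpy as np

def read_logits_file(path: str):
    with open(path, 'rb') as f:
        assert f.read(8) == b'_logits_'
        n_ctx, n_vocab, n_chunk = (int(x) for x in np.frombuffer(f.read(12), dtype='<i4'))
        tokens = np.frombuffer(f.read(4 * n_ctx * n_chunk), dtype='<i4').reshape(n_chunk, n_ctx)
        nv = 2 * ((n_vocab + 1) // 2) + 4   # uint16 slots per stored position
        n_rec = n_ctx - 1 - n_ctx // 2      # only the second half of each chunk is stored
        records = np.frombuffer(f.read(), dtype='<u2').reshape(n_chunk, n_rec, nv)
    return tokens, records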
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
@ -1091,6 +1838,10 @@ int main(int argc, char ** argv) {
hellaswag_score(ctx, params); hellaswag_score(ctx, params);
} else if (params.winogrande) { } else if (params.winogrande) {
winogrande_score(ctx, params); winogrande_score(ctx, params);
} else if (params.multiple_choice) {
multiple_choice_score(ctx, params);
} else if (params.kl_divergence) {
kl_divergence(ctx, params);
} else { } else {
results = perplexity(ctx, params); results = perplexity(ctx, params);
} }
View file
@ -1,14 +1,14 @@
# Function calling example using pydantic models. # Function calling example using pydantic models.
import datetime import datetime
import importlib
import json import json
from enum import Enum from enum import Enum
from typing import Union, Optional from typing import Optional, Union
import requests import requests
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model,
import importlib create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
# Function to get completion on the llama.cpp server with grammar. # Function to get completion on the llama.cpp server with grammar.
@ -35,7 +35,7 @@ class SendMessageToUser(BaseModel):
print(self.message) print(self.message)
# Enum for the calculator function. # Enum for the calculator tool.
class MathOperation(Enum): class MathOperation(Enum):
ADD = "add" ADD = "add"
SUBTRACT = "subtract" SUBTRACT = "subtract"
@ -43,7 +43,7 @@ class MathOperation(Enum):
DIVIDE = "divide" DIVIDE = "divide"
# Very simple calculator tool for the agent. # Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
class Calculator(BaseModel): class Calculator(BaseModel):
""" """
Perform a math operation on two numbers. Perform a math operation on two numbers.
@ -148,37 +148,6 @@ def get_current_datetime(output_format: Optional[str] = None):
return datetime.datetime.now().strftime(output_format) return datetime.datetime.now().strftime(output_format)
# Enum for the calculator tool.
class MathOperation(Enum):
ADD = "add"
SUBTRACT = "subtract"
MULTIPLY = "multiply"
DIVIDE = "divide"
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
class Calculator(BaseModel):
"""
Perform a math operation on two numbers.
"""
number_one: Union[int, float] = Field(..., description="First number.")
operation: MathOperation = Field(..., description="Math operation to perform.")
number_two: Union[int, float] = Field(..., description="Second number.")
def run(self):
if self.operation == MathOperation.ADD:
return self.number_one + self.number_two
elif self.operation == MathOperation.SUBTRACT:
return self.number_one - self.number_two
elif self.operation == MathOperation.MULTIPLY:
return self.number_one * self.number_two
elif self.operation == MathOperation.DIVIDE:
return self.number_one / self.number_two
else:
raise ValueError("Unknown operation.")
# Example function to get the weather # Example function to get the weather
def get_current_weather(location, unit): def get_current_weather(location, unit):
"""Get the current weather in a given location""" """Get the current weather in a given location"""
View file
@ -1,15 +1,21 @@
from __future__ import annotations
import inspect import inspect
import json import json
import re
from copy import copy from copy import copy
from inspect import isclass, getdoc from enum import Enum
from types import NoneType from inspect import getdoc, isclass
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
from docstring_parser import parse from docstring_parser import parse
from pydantic import BaseModel, create_model, Field from pydantic import BaseModel, Field, create_model
from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias
from enum import Enum if TYPE_CHECKING:
from typing import get_type_hints, Callable from types import GenericAlias
import re else:
# python 3.8 compat
from typing import _GenericAlias as GenericAlias
class PydanticDataType(Enum): class PydanticDataType(Enum):
@ -43,7 +49,7 @@ class PydanticDataType(Enum):
SET = "set" SET = "set"
def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str: def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
if isclass(pydantic_type) and issubclass(pydantic_type, str): if isclass(pydantic_type) and issubclass(pydantic_type, str):
return PydanticDataType.STRING.value return PydanticDataType.STRING.value
elif isclass(pydantic_type) and issubclass(pydantic_type, bool): elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
@ -57,22 +63,22 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel): elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
return format_model_and_field_name(pydantic_type.__name__) return format_model_and_field_name(pydantic_type.__name__)
elif get_origin(pydantic_type) == list: elif get_origin(pydantic_type) is list:
element_type = get_args(pydantic_type)[0] element_type = get_args(pydantic_type)[0]
return f"{map_pydantic_type_to_gbnf(element_type)}-list" return f"{map_pydantic_type_to_gbnf(element_type)}-list"
elif get_origin(pydantic_type) == set: elif get_origin(pydantic_type) is set:
element_type = get_args(pydantic_type)[0] element_type = get_args(pydantic_type)[0]
return f"{map_pydantic_type_to_gbnf(element_type)}-set" return f"{map_pydantic_type_to_gbnf(element_type)}-set"
elif get_origin(pydantic_type) == Union: elif get_origin(pydantic_type) is Union:
union_types = get_args(pydantic_type) union_types = get_args(pydantic_type)
union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types] union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
return f"union-{'-or-'.join(union_rules)}" return f"union-{'-or-'.join(union_rules)}"
elif get_origin(pydantic_type) == Optional: elif get_origin(pydantic_type) is Optional:
element_type = get_args(pydantic_type)[0] element_type = get_args(pydantic_type)[0]
return f"optional-{map_pydantic_type_to_gbnf(element_type)}" return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
elif isclass(pydantic_type): elif isclass(pydantic_type):
return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}" return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
elif get_origin(pydantic_type) == dict: elif get_origin(pydantic_type) is dict:
key_type, value_type = get_args(pydantic_type) key_type, value_type = get_args(pydantic_type)
return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}" return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
else: else:
@ -106,7 +112,6 @@ def get_members_structure(cls, rule_name):
return f"{cls.__name__.lower()} ::= " + " | ".join(members) return f"{cls.__name__.lower()} ::= " + " | ".join(members)
if cls.__annotations__ and cls.__annotations__ != {}: if cls.__annotations__ and cls.__annotations__ != {}:
result = f'{rule_name} ::= "{{"' result = f'{rule_name} ::= "{{"'
type_list_rules = []
# Modify this comprehension # Modify this comprehension
members = [ members = [
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}' f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}'
@ -116,17 +121,15 @@ def get_members_structure(cls, rule_name):
result += '"," '.join(members) result += '"," '.join(members)
result += ' "}"' result += ' "}"'
return result, type_list_rules return result
elif rule_name == "custom-class-any": if rule_name == "custom-class-any":
result = f"{rule_name} ::= " result = f"{rule_name} ::= "
result += "value" result += "value"
type_list_rules = [] return result
return result, type_list_rules
else:
init_signature = inspect.signature(cls.__init__) init_signature = inspect.signature(cls.__init__)
parameters = init_signature.parameters parameters = init_signature.parameters
result = f'{rule_name} ::= "{{"' result = f'{rule_name} ::= "{{"'
type_list_rules = []
# Modify this comprehension too # Modify this comprehension too
members = [ members = [
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}' f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}'
@ -136,7 +139,7 @@ def get_members_structure(cls, rule_name):
result += '", "'.join(members) result += '", "'.join(members)
result += ' "}"' result += ' "}"'
return result, type_list_rules return result
def regex_to_gbnf(regex_pattern: str) -> str: def regex_to_gbnf(regex_pattern: str) -> str:
@ -269,7 +272,7 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
def generate_gbnf_rule_for_type( def generate_gbnf_rule_for_type(
model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None
) -> Tuple[str, list]: ) -> tuple[str, list[str]]:
""" """
Generate GBNF rule for a given field type. Generate GBNF rule for a given field type.
@ -283,7 +286,7 @@ def generate_gbnf_rule_for_type(
:param field_info: Additional information about the field (optional). :param field_info: Additional information about the field (optional).
:return: Tuple containing the GBNF type and a list of additional rules. :return: Tuple containing the GBNF type and a list of additional rules.
:rtype: Tuple[str, list] :rtype: tuple[str, list]
""" """
rules = [] rules = []
@ -321,8 +324,7 @@ def generate_gbnf_rule_for_type(
gbnf_type, rules = model_name + "-" + field_name, rules gbnf_type, rules = model_name + "-" + field_name, rules
elif gbnf_type.startswith("custom-class-"): elif gbnf_type.startswith("custom-class-"):
nested_model_rules, field_types = get_members_structure(field_type, gbnf_type) rules.append(get_members_structure(field_type, gbnf_type))
rules.append(nested_model_rules)
elif gbnf_type.startswith("custom-dict-"): elif gbnf_type.startswith("custom-dict-"):
key_type, value_type = get_args(field_type) key_type, value_type = get_args(field_type)
@ -341,14 +343,14 @@ def generate_gbnf_rule_for_type(
union_rules = [] union_rules = []
for union_type in union_types: for union_type in union_types:
if isinstance(union_type, _GenericAlias): if isinstance(union_type, GenericAlias):
union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type( union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
model_name, field_name, union_type, False, processed_models, created_rules model_name, field_name, union_type, False, processed_models, created_rules
) )
union_rules.append(union_gbnf_type) union_rules.append(union_gbnf_type)
rules.extend(union_rules_list) rules.extend(union_rules_list)
elif not issubclass(union_type, NoneType): elif not issubclass(union_type, type(None)):
union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type( union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
model_name, field_name, union_type, False, processed_models, created_rules model_name, field_name, union_type, False, processed_models, created_rules
) )
@ -424,14 +426,10 @@ def generate_gbnf_rule_for_type(
else: else:
gbnf_type, rules = gbnf_type, [] gbnf_type, rules = gbnf_type, []
if gbnf_type not in created_rules:
return gbnf_type, rules
else:
if gbnf_type in created_rules:
return gbnf_type, rules return gbnf_type, rules
def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created_rules: dict) -> (list, bool, bool): def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[BaseModel]], created_rules: dict[str, list[str]]) -> tuple[list[str], bool]:
""" """
Generate GBnF Grammar Generate GBnF Grammar
@ -452,7 +450,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
``` ```
""" """
if model in processed_models: if model in processed_models:
return [] return [], False
processed_models.add(model) processed_models.add(model)
model_name = format_model_and_field_name(model.__name__) model_name = format_model_and_field_name(model.__name__)
@ -518,7 +516,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
def generate_gbnf_grammar_from_pydantic_models( def generate_gbnf_grammar_from_pydantic_models(
models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None, models: list[type[BaseModel]], outer_object_name: str | None = None, outer_object_content: str | None = None,
list_of_outputs: bool = False list_of_outputs: bool = False
) -> str: ) -> str:
""" """
@ -528,7 +526,7 @@ def generate_gbnf_grammar_from_pydantic_models(
* grammar. * grammar.
Args: Args:
models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from. models (list[type[BaseModel]]): A list of Pydantic models to generate the grammar from.
outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
list_of_outputs (str, optional): Allows a list of output objects list_of_outputs (str, optional): Allows a list of output objects
@ -543,9 +541,9 @@ def generate_gbnf_grammar_from_pydantic_models(
# root ::= UserModel | PostModel # root ::= UserModel | PostModel
# ... # ...
""" """
processed_models = set() processed_models: set[type[BaseModel]] = set()
all_rules = [] all_rules = []
created_rules = {} created_rules: dict[str, list[str]] = {}
if outer_object_name is None: if outer_object_name is None:
for model in models: for model in models:
model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules) model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules)
@ -608,7 +606,7 @@ def get_primitive_grammar(grammar):
Returns: Returns:
str: GBNF primitive grammar string. str: GBNF primitive grammar string.
""" """
type_list = [] type_list: list[type[object]] = []
if "string-list" in grammar: if "string-list" in grammar:
type_list.append(str) type_list.append(str)
if "boolean-list" in grammar: if "boolean-list" in grammar:
@ -666,14 +664,14 @@ triple-quotes ::= "'''" """
def generate_markdown_documentation( def generate_markdown_documentation(
pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields", pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
documentation_with_field_description=True documentation_with_field_description=True
) -> str: ) -> str:
""" """
Generate markdown documentation for a list of Pydantic models. Generate markdown documentation for a list of Pydantic models.
Args: Args:
pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. pydantic_models (list[type[BaseModel]]): list of Pydantic model classes.
model_prefix (str): Prefix for the model section. model_prefix (str): Prefix for the model section.
fields_prefix (str): Prefix for the fields section. fields_prefix (str): Prefix for the fields section.
documentation_with_field_description (bool): Include field descriptions in the documentation. documentation_with_field_description (bool): Include field descriptions in the documentation.
@ -731,7 +729,7 @@ def generate_markdown_documentation(
def generate_field_markdown( def generate_field_markdown(
field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
documentation_with_field_description=True documentation_with_field_description=True
) -> str: ) -> str:
""" """
@ -739,8 +737,8 @@ def generate_field_markdown(
Args: Args:
field_name (str): Name of the field. field_name (str): Name of the field.
field_type (Type[Any]): Type of the field. field_type (type[Any]): Type of the field.
model (Type[BaseModel]): Pydantic model class. model (type[BaseModel]): Pydantic model class.
depth (int): Indentation depth in the documentation. depth (int): Indentation depth in the documentation.
documentation_with_field_description (bool): Include field descriptions in the documentation. documentation_with_field_description (bool): Include field descriptions in the documentation.
@ -798,7 +796,7 @@ def generate_field_markdown(
return field_text return field_text
def format_json_example(example: dict, depth: int) -> str: def format_json_example(example: dict[str, Any], depth: int) -> str:
""" """
Format a JSON example into a readable string with indentation. Format a JSON example into a readable string with indentation.
@ -819,14 +817,14 @@ def format_json_example(example: dict, depth: int) -> str:
def generate_text_documentation( def generate_text_documentation(
pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields", pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
documentation_with_field_description=True documentation_with_field_description=True
) -> str: ) -> str:
""" """
Generate text documentation for a list of Pydantic models. Generate text documentation for a list of Pydantic models.
Args: Args:
pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. pydantic_models (list[type[BaseModel]]): List of Pydantic model classes.
model_prefix (str): Prefix for the model section. model_prefix (str): Prefix for the model section.
fields_prefix (str): Prefix for the fields section. fields_prefix (str): Prefix for the fields section.
documentation_with_field_description (bool): Include field descriptions in the documentation. documentation_with_field_description (bool): Include field descriptions in the documentation.
@ -885,7 +883,7 @@ def generate_text_documentation(
def generate_field_text( def generate_field_text(
field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
documentation_with_field_description=True documentation_with_field_description=True
) -> str: ) -> str:
""" """
@ -893,8 +891,8 @@ def generate_field_text(
Args: Args:
field_name (str): Name of the field. field_name (str): Name of the field.
field_type (Type[Any]): Type of the field. field_type (type[Any]): Type of the field.
model (Type[BaseModel]): Pydantic model class. model (type[BaseModel]): Pydantic model class.
depth (int): Indentation depth in the documentation. depth (int): Indentation depth in the documentation.
documentation_with_field_description (bool): Include field descriptions in the documentation. documentation_with_field_description (bool): Include field descriptions in the documentation.
@ -1017,8 +1015,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
pydantic_model_list, pydantic_model_list,
grammar_file_path="./generated_grammar.gbnf", grammar_file_path="./generated_grammar.gbnf",
documentation_file_path="./generated_grammar_documentation.md", documentation_file_path="./generated_grammar_documentation.md",
outer_object_name: str = None, outer_object_name: str | None = None,
outer_object_content: str = None, outer_object_content: str | None = None,
model_prefix: str = "Output Model", model_prefix: str = "Output Model",
fields_prefix: str = "Output Fields", fields_prefix: str = "Output Fields",
list_of_outputs: bool = False, list_of_outputs: bool = False,
@ -1053,8 +1051,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
def generate_gbnf_grammar_and_documentation( def generate_gbnf_grammar_and_documentation(
pydantic_model_list, pydantic_model_list,
outer_object_name: str = None, outer_object_name: str | None = None,
outer_object_content: str = None, outer_object_content: str | None = None,
model_prefix: str = "Output Model", model_prefix: str = "Output Model",
fields_prefix: str = "Output Fields", fields_prefix: str = "Output Fields",
list_of_outputs: bool = False, list_of_outputs: bool = False,
@ -1086,9 +1084,9 @@ def generate_gbnf_grammar_and_documentation(
def generate_gbnf_grammar_and_documentation_from_dictionaries( def generate_gbnf_grammar_and_documentation_from_dictionaries(
dictionaries: List[dict], dictionaries: list[dict[str, Any]],
outer_object_name: str = None, outer_object_name: str | None = None,
outer_object_content: str = None, outer_object_content: str | None = None,
model_prefix: str = "Output Model", model_prefix: str = "Output Model",
fields_prefix: str = "Output Fields", fields_prefix: str = "Output Fields",
list_of_outputs: bool = False, list_of_outputs: bool = False,
@ -1098,7 +1096,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
Generate GBNF grammar and documentation from a list of dictionaries. Generate GBNF grammar and documentation from a list of dictionaries.
Args: Args:
dictionaries (List[dict]): List of dictionaries representing Pydantic models. dictionaries (list[dict]): List of dictionaries representing Pydantic models.
outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
model_prefix (str): Prefix for the model section in the documentation. model_prefix (str): Prefix for the model section in the documentation.
@ -1120,7 +1118,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
return grammar, documentation return grammar, documentation
def create_dynamic_model_from_function(func: Callable): def create_dynamic_model_from_function(func: Callable[..., Any]):
""" """
Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method. Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method.
@ -1135,6 +1133,7 @@ def create_dynamic_model_from_function(func: Callable):
sig = inspect.signature(func) sig = inspect.signature(func)
# Parse the docstring # Parse the docstring
assert func.__doc__ is not None
docstring = parse(func.__doc__) docstring = parse(func.__doc__)
dynamic_fields = {} dynamic_fields = {}
@ -1157,7 +1156,6 @@ def create_dynamic_model_from_function(func: Callable):
f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring") f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")
# Add parameter details to the schema # Add parameter details to the schema
param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
param_docs.append((param.name, param_doc)) param_docs.append((param.name, param_doc))
if param.default == inspect.Parameter.empty: if param.default == inspect.Parameter.empty:
default_value = ... default_value = ...
@ -1166,10 +1164,10 @@ def create_dynamic_model_from_function(func: Callable):
dynamic_fields[param.name] = ( dynamic_fields[param.name] = (
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
# Creating the dynamic model # Creating the dynamic model
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) # type: ignore[call-overload]
for param_doc in param_docs: for name, param_doc in param_docs:
dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description dynamic_model.model_fields[name].description = param_doc.description
dynamic_model.__doc__ = docstring.short_description dynamic_model.__doc__ = docstring.short_description
@ -1182,16 +1180,16 @@ def create_dynamic_model_from_function(func: Callable):
return dynamic_model return dynamic_model
def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable): def add_run_method_to_dynamic_model(model: type[BaseModel], func: Callable[..., Any]):
""" """
Add a 'run' method to a dynamic Pydantic model, using the provided function. Add a 'run' method to a dynamic Pydantic model, using the provided function.
Args: Args:
model (Type[BaseModel]): Dynamic Pydantic model class. model (type[BaseModel]): Dynamic Pydantic model class.
func (Callable): Function to be added as a 'run' method to the model. func (Callable): Function to be added as a 'run' method to the model.
Returns: Returns:
Type[BaseModel]: Pydantic model class with the added 'run' method. type[BaseModel]: Pydantic model class with the added 'run' method.
""" """
def run_method_wrapper(self): def run_method_wrapper(self):
@ -1204,15 +1202,15 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
return model return model
def create_dynamic_models_from_dictionaries(dictionaries: List[dict]): def create_dynamic_models_from_dictionaries(dictionaries: list[dict[str, Any]]):
""" """
Create a list of dynamic Pydantic model classes from a list of dictionaries. Create a list of dynamic Pydantic model classes from a list of dictionaries.
Args: Args:
dictionaries (List[dict]): List of dictionaries representing model structures. dictionaries (list[dict]): List of dictionaries representing model structures.
Returns: Returns:
List[Type[BaseModel]]: List of generated dynamic Pydantic model classes. list[type[BaseModel]]: List of generated dynamic Pydantic model classes.
""" """
dynamic_models = [] dynamic_models = []
for func in dictionaries: for func in dictionaries:
@ -1249,7 +1247,7 @@ def list_to_enum(enum_name, values):
return Enum(enum_name, {value: value for value in values}) return Enum(enum_name, {value: value for value in values})
def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]: def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: str = "CustomModel") -> type[Any]:
""" """
Convert a dictionary to a Pydantic model class. Convert a dictionary to a Pydantic model class.
@ -1258,9 +1256,9 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
model_name (str): Name of the generated Pydantic model. model_name (str): Name of the generated Pydantic model.
Returns: Returns:
Type[BaseModel]: Generated Pydantic model class. type[BaseModel]: Generated Pydantic model class.
""" """
fields = {} fields: dict[str, Any] = {}
if "properties" in dictionary: if "properties" in dictionary:
for field_name, field_data in dictionary.get("properties", {}).items(): for field_name, field_data in dictionary.get("properties", {}).items():
@ -1277,7 +1275,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
if items != {}: if items != {}:
array = {"properties": items} array = {"properties": items}
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
fields[field_name] = (List[array_type], ...) fields[field_name] = (List[array_type], ...) # type: ignore[valid-type]
else: else:
fields[field_name] = (list, ...) fields[field_name] = (list, ...)
elif field_type == "object": elif field_type == "object":

View file

@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", }, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },

View file

@ -1,7 +1,7 @@
set(TARGET server) set(TARGET server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp json.hpp httplib.h) add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}> SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>

View file

@ -30,7 +30,8 @@ Command line options:
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled) - `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime) - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend (default: 1 = disabled), used together with group attention width `--grp-attn-w`
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend (default: 512), used together with group attention factor `--grp-attn-n`
## Build ## Build
server is built alongside everything else from the root of the project server is built alongside everything else from the root of the project
@ -65,6 +66,14 @@ server.exe -m models\7B\ggml-model.gguf -c 2048
The above command will start a server that by default listens on `127.0.0.1:8080`. The above command will start a server that by default listens on `127.0.0.1:8080`.
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url. You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
### Docker:
```bash
docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
# or, with CUDA:
docker run -p 8080:8080 -v /path/to/models:/models --gpus all ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
```
## Testing with CURL ## Testing with CURL
Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS. Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.

208
examples/server/oai.hpp Normal file
View file

@ -0,0 +1,208 @@
#pragma once
#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include "json.hpp"
#include "utils.hpp"
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
using json = nlohmann::json;
inline static json oaicompat_completion_params_parse(
const json &body /* openai api json semantics */)
{
json llama_params;
llama_params["__oaicompat"] = true;
// Map OpenAI parameters to llama.cpp parameters
//
// For parameters that are defined by the OpenAI documentation (e.g.
// temperature), we explicitly specify OpenAI's intended default; we
// need to do that because sometimes OpenAI disagrees with llama.cpp
//
// https://platform.openai.com/docs/api-reference/chat/create
llama_sampling_params default_sparams;
llama_params["model"] = json_value(body, "model", std::string("unknown"));
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
llama_params["temperature"] = json_value(body, "temperature", 0.0);
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
llama_params["top_p"] = json_value(body, "top_p", 1.0);
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
llama_params["stream"] = json_value(body, "stream", false);
llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
if (body.count("grammar") != 0) {
llama_params["grammar"] = json_value(body, "grammar", json::object());
}
// Handle 'stop' field
if (body.contains("stop") && body["stop"].is_string()) {
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
} else {
llama_params["stop"] = json_value(body, "stop", json::array());
}
// Ensure the ChatML-specific end sequence is among the stop words
llama_params["stop"].push_back("<|im_end|>");
return llama_params;
}
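As a hedged illustration (not part of the diff), this is roughly how the mapping behaves for a typical OpenAI-style request; the snippet assumes it is compiled inside the server target so that `oai.hpp` and the headers it pulls in resolve:

```cpp
// Hypothetical request body in OpenAI chat-completion format.
json body = {
    {"model", "gpt-3.5-turbo"},
    {"messages", json::array({
        json{{"role", "user"}, {"content", "Write a haiku about autumn."}}
    })},
    {"temperature", 0.8},
    {"max_tokens", 64},
    {"stream", true},
};

json llama_params = oaicompat_completion_params_parse(body);
// llama_params["prompt"]      -> ChatML string built from "messages" by format_chatml()
// llama_params["n_predict"]   -> 64       (renamed from "max_tokens")
// llama_params["temperature"] -> 0.8      (0.0 is used when the field is absent)
// llama_params["stop"]        -> ["<|im_end|>"]  (the ChatML end sequence is always appended)
```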
inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
{
json result = response.result_json;
bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason = "length";
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
json choices =
streaming ? json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}})
: json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"message", json{{"content", content},
{"role", "assistant"}}}}});
std::time_t t = std::time(0);
json res =
json{{"choices", choices},
{"created", t},
{"model",
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
{"usage",
json{{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
{"id", gen_chatcmplid()}};
if (server_verbose) {
res["__verbose"] = result;
}
if (result.contains("completion_probabilities")) {
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
}
return res;
}
// the return value is a vector, as there is one case where we might need to generate two responses
inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
json result = response.result_json;
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({response.result_json});
}
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
bool stopped_word = json_value(result, "stopped_word", false);
bool stopped_eos = json_value(result, "stopped_eos", false);
bool stopped_limit = json_value(result, "stopped_limit", false);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason;
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
if (stopped_limit) {
finish_reason = "length";
}
std::time_t t = std::time(0);
json choices;
if (!finish_reason.empty()) {
choices = json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}});
} else {
if (first) {
if (content.empty()) {
choices = json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{{"role", "assistant"}}}}});
} else {
// We have to send this as two updates to conform to openai behavior
json initial_ret = json{{"choices", json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"role", "assistant"}
}}}})},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
json second_ret = json{
{"choices", json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"content", content}}}
}})},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({initial_ret, second_ret});
}
} else {
// Some idiosyncrasy in the task processing logic makes several trailing calls
// with empty content; we ignore these at the callee site.
if (content.empty()) {
return std::vector<json>({json::object()});
}
choices = json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta",
json{
{"content", content},
}},
}});
}
}
json ret = json{{"choices", choices},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({ret});
}
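For illustration (the field layout mirrors the function above; the content string and omitted fields are assumptions), the chunks produced for the first non-empty token look roughly like this:

```cpp
// First call with non-empty content yields two chunks: the role first, then the content.
json first_chunk = {
    {"object", "chat.completion.chunk"},
    {"choices", json::array({ json{
        {"index", 0},
        {"finish_reason", nullptr},
        {"delta", json{{"role", "assistant"}}}
    } })},
};
json second_chunk = {
    {"object", "chat.completion.chunk"},
    {"choices", json::array({ json{
        {"index", 0},
        {"finish_reason", nullptr},
        {"delta", json{{"content", "Hello"}}}
    } })},
};
// Each chunk also carries "id", "created" and "model" (omitted here for brevity).
// Later calls put only the new text in "delta"; the final chunk has an empty "delta"
// and "finish_reason" set to "stop" or "length".
```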

File diff suppressed because it is too large

508
examples/server/utils.hpp Normal file
View file

@ -0,0 +1,508 @@
#pragma once
#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include "json.hpp"
#include "../llava/clip.h"
using json = nlohmann::json;
extern bool server_verbose;
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif
#if SERVER_VERBOSE != 1
#define LOG_VERBOSE(MSG, ...)
#else
#define LOG_VERBOSE(MSG, ...) \
do \
{ \
if (server_verbose) \
{ \
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
} \
} while (0)
#endif
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
//
// parallel
//
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
SERVER_STATE_ERROR // An error occurred, load_model failed
};
enum task_type {
TASK_TYPE_COMPLETION,
TASK_TYPE_CANCEL,
TASK_TYPE_NEXT_RESPONSE
};
struct task_server {
int id = -1; // to be filled by llama_server_queue
int target_id;
task_type type;
json data;
bool infill_mode = false;
bool embedding_mode = false;
int multitask_id = -1;
};
struct task_result {
int id;
int multitask_id = -1;
bool stop;
bool error;
json result_json;
};
struct task_multi {
int id;
std::set<int> subtasks_remaining{};
std::vector<task_result> results{};
};
// TODO: can become bool if we don't find a use for more states
enum slot_state
{
IDLE,
PROCESSING,
};
enum slot_command
{
NONE,
LOAD_PROMPT,
RELEASE,
};
struct slot_params
{
bool stream = true;
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
uint32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_predict = -1; // new tokens to predict
std::vector<std::string> antiprompt;
json input_prefix;
json input_suffix;
};
struct slot_image
{
int32_t id;
bool request_encode_image = false;
float * image_embedding = nullptr;
int32_t image_tokens = 0;
clip_image_u8 * img_data;
std::string prefix_prompt; // prompt that comes before this image
};
// completion token output with probabilities
struct completion_token_output
{
struct token_prob
{
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
llama_token tok;
std::string text_to_send;
};
static inline void server_log(const char *level, const char *function, int line,
const char *message, const nlohmann::ordered_json &extra)
{
nlohmann::ordered_json log
{
{"timestamp", time(nullptr)},
{"level", level},
{"function", function},
{"line", line},
{"message", message},
};
if (!extra.empty())
{
log.merge_patch(extra);
}
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout);
}
//
// server utils
//
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value)
{
// Fallback null to default value
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
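A hedged illustration of the fallback behaviour (the keys and values below are made up for the example):

```cpp
json body = {{"top_k", 40}, {"temperature", nullptr}};
int    top_k = json_value(body, "top_k",       50);   // 40   (key present and not null)
double temp  = json_value(body, "temperature", 0.8);  // 0.8  (null falls back to the default)
int    n     = json_value(body, "n_predict",   -1);   // -1   (key missing)
```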
inline std::string format_chatml(std::vector<json> messages)
{
std::ostringstream chatml_msgs;
for (auto it = messages.begin(); it != messages.end(); ++it) {
chatml_msgs << "<|im_start|>"
<< json_value(*it, "role", std::string("user")) << '\n';
chatml_msgs << json_value(*it, "content", std::string(""))
<< "<|im_end|>\n";
}
chatml_msgs << "<|im_start|>assistant" << '\n';
return chatml_msgs.str();
}
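For reference, a hedged sketch of what this helper produces (assumes this header and its llama.cpp dependencies are on the include path; the messages are illustrative):

```cpp
std::vector<json> messages = {
    json{{"role", "system"}, {"content", "You are a helpful assistant."}},
    json{{"role", "user"},   {"content", "Hello!"}},
};
std::string prompt = format_chatml(messages);
// prompt now holds:
//   <|im_start|>system
//   You are a helpful assistant.<|im_end|>
//   <|im_start|>user
//   Hello!<|im_end|>
//   <|im_start|>assistant
```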
//
// work queue utils
//
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
std::vector<task_multi> queue_multitasks;
std::condition_variable condition_tasks;
// callback functions
std::function<void(task_server&)> callback_new_task;
std::function<void(task_multi&)> callback_finish_multitask;
std::function<void(void)> callback_all_task_finished;
// Add a new task to the end of the queue
int post(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (task.id == -1) {
task.id = id++;
}
queue_tasks.push_back(std::move(task));
condition_tasks.notify_one();
return task.id;
}
// Add a new task, but defer until one slot is available
void defer(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
queue_tasks_deferred.push_back(std::move(task));
}
// Get the next id for creating a new task
int get_new_id() {
std::unique_lock<std::mutex> lock(mutex_tasks);
return id++;
}
// Register function to process a new task
void on_new_task(std::function<void(task_server&)> callback) {
callback_new_task = callback;
}
// Register function to process a multitask
void on_finish_multitask(std::function<void(task_multi&)> callback) {
callback_finish_multitask = callback;
}
// Register the function to be called when the batch of tasks is finished
void on_all_tasks_finished(std::function<void(void)> callback) {
callback_all_task_finished = callback;
}
// Call when the state of one slot is changed
void notify_slot_changed() {
// move deferred tasks back to main loop
std::unique_lock<std::mutex> lock(mutex_tasks);
for (auto & task : queue_tasks_deferred) {
queue_tasks.push_back(std::move(task));
}
queue_tasks_deferred.clear();
}
// Start the main loop. This call is blocking
[[noreturn]]
void start_loop() {
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
{
while (true)
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
lock.unlock();
break;
}
task_server task = queue_tasks.front();
queue_tasks.erase(queue_tasks.begin());
lock.unlock();
LOG_VERBOSE("callback_new_task", {});
callback_new_task(task);
}
LOG_VERBOSE("callback_all_task_finished", {});
// process and update all the multitasks
auto queue_iterator = queue_multitasks.begin();
while (queue_iterator != queue_multitasks.end())
{
if (queue_iterator->subtasks_remaining.empty())
{
// all subtasks done == multitask is done
task_multi current_multitask = *queue_iterator;
callback_finish_multitask(current_multitask);
// remove this multitask
queue_iterator = queue_multitasks.erase(queue_iterator);
}
else
{
++queue_iterator;
}
}
// all tasks in the current loop are finished
callback_all_task_finished();
}
LOG_VERBOSE("wait for new task", {});
// wait for new task
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
condition_tasks.wait(lock, [&]{
return !queue_tasks.empty();
});
}
}
}
}
//
// functions to manage multitasks
//
// add a multitask by specifying the ids of all its subtasks (a subtask is a task_server)
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
task_multi multi;
multi.id = multitask_id;
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
queue_multitasks.push_back(multi);
}
// update the remaining subtasks, appending results to the multitask
void update_multitask(int multitask_id, int subtask_id, task_result& result)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
for (auto& multitask : queue_multitasks)
{
if (multitask.id == multitask_id)
{
multitask.subtasks_remaining.erase(subtask_id);
multitask.results.push_back(result);
}
}
}
};
struct llama_server_response {
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
callback_multitask_t callback_update_multitask;
// for keeping track of all tasks waiting for the result
std::set<int> waiting_task_ids;
// the main result queue
std::vector<task_result> queue_results;
std::mutex mutex_results;
std::condition_variable condition_results;
void add_waiting_task_id(int task_id) {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(task_id);
}
void remove_waiting_task_id(int task_id) {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(task_id);
}
// This function blocks the thread until there is a response for this task_id
task_result recv(int task_id) {
while (true)
{
std::unique_lock<std::mutex> lock(mutex_results);
condition_results.wait(lock, [&]{
return !queue_results.empty();
});
LOG_VERBOSE("condition_results unblock", {});
for (int i = 0; i < (int) queue_results.size(); i++)
{
if (queue_results[i].id == task_id)
{
assert(queue_results[i].multitask_id == -1);
task_result res = queue_results[i];
queue_results.erase(queue_results.begin() + i);
return res;
}
}
}
// should never reach here
}
// Register the function to update multitask
void on_multitask_update(callback_multitask_t callback) {
callback_update_multitask = callback;
}
// Send a new result to a waiting task_id
void send(task_result result) {
std::unique_lock<std::mutex> lock(mutex_results);
LOG_VERBOSE("send new result", {});
for (auto& task_id : waiting_task_ids) {
// LOG_TEE("waiting task id %i \n", task_id);
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
if (result.multitask_id == task_id)
{
LOG_VERBOSE("callback_update_multitask", {});
callback_update_multitask(task_id, result.id, result);
continue;
}
if (result.id == task_id)
{
LOG_VERBOSE("queue_results.push_back", {});
queue_results.push_back(result);
condition_results.notify_one();
return;
}
}
}
};
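To make the intended flow between the queue and the response channel concrete, here is a minimal usage sketch (not part of the diff); it assumes it is compiled inside the llama.cpp server target so that `utils.hpp` and the headers it pulls in resolve, and the function name is purely illustrative:

```cpp
#include <thread>
#include "utils.hpp"

static void example_queue_usage() {
    llama_server_queue    queue;
    llama_server_response response;

    // Worker side: turn each incoming task into a result and publish it.
    queue.on_new_task([&](task_server & task) {
        task_result res;
        res.id          = task.id;
        res.stop        = true;
        res.error       = false;
        res.result_json = json{{"content", "done"}};
        response.send(res);
    });
    queue.on_finish_multitask([](task_multi & multi) { (void) multi; /* aggregate multi.results here */ });
    queue.on_all_tasks_finished([]() { /* e.g. run the shared decode batch */ });

    std::thread loop([&]() { queue.start_loop(); }); // start_loop() blocks, so run it on its own thread
    loop.detach();

    // Client side: reserve an id, register it as waiting, then post the task.
    const int id = queue.get_new_id();
    response.add_waiting_task_id(id);

    task_server task;
    task.id        = id;
    task.target_id = -1;
    task.type      = TASK_TYPE_COMPLETION;
    task.data      = json{{"prompt", "Hello"}};
    queue.post(task);

    task_result result = response.recv(id);           // blocks until send() delivers this id
    response.remove_waiting_task_id(id);
    (void) result;
}
```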
//
// base64 utils (TODO: move to common in the future)
//
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
static inline bool is_base64(uint8_t c)
{
return (isalnum(c) || (c == '+') || (c == '/'));
}
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
{
int i = 0;
int j = 0;
int in_ = 0;
int in_len = encoded_string.size();
uint8_t char_array_4[4];
uint8_t char_array_3[3];
std::vector<uint8_t> ret;
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
{
char_array_4[i++] = encoded_string[in_]; in_++;
if (i == 4)
{
for (i = 0; i <4; i++)
{
char_array_4[i] = base64_chars.find(char_array_4[i]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (i = 0; (i < 3); i++)
{
ret.push_back(char_array_3[i]);
}
i = 0;
}
}
if (i)
{
for (j = i; j <4; j++)
{
char_array_4[j] = 0;
}
for (j = 0; j <4; j++)
{
char_array_4[j] = base64_chars.find(char_array_4[j]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (j = 0; (j < i - 1); j++)
{
ret.push_back(char_array_3[j]);
}
}
return ret;
}
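A quick hedged usage example of the decoder above (the input string is only illustrative):

```cpp
std::vector<uint8_t> bytes = base64_decode("aGVsbG8=");   // "aGVsbG8=" is the base64 encoding of "hello"
std::string text(bytes.begin(), bytes.end());             // "hello"
```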
//
// random string / id
//
static std::string random_string()
{
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
std::random_device rd;
std::mt19937 generator(rd());
std::string result(32, ' ');
for (int i = 0; i < 32; ++i) {
result[i] = str[generator() % str.size()];
}
return result;
}
static std::string gen_chatcmplid()
{
std::stringstream chatcmplid;
chatcmplid << "chatcmpl-" << random_string();
return chatcmplid.str();
}

View file

@ -0,0 +1,9 @@
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
set(TARGET ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

47
examples/sycl/README.md Normal file
View file

@ -0,0 +1,47 @@
# llama.cpp/example/sycl
This example program provides tools for llama.cpp with SYCL on Intel GPUs.
## Tool
|Tool Name| Function|Status|
|-|-|-|
|ls-sycl-device|List all SYCL devices with ID, compute capability, max work group size, etc.|Supported|
### ls-sycl-device
List all SYCL devices with ID, compute capability, max work group size, etc.
1. Build llama.cpp for SYCL for all targets.
2. Enable the oneAPI runtime environment
```
source /opt/intel/oneapi/setvars.sh
```
3. Execute
```
./build/bin/ls-sycl-device
```
Check the device IDs in the startup log, for example:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level Zero runtime, recommended|
|compute capability 3.0|OpenCL runtime, slower than Level Zero in most cases|

20
examples/sycl/build.sh Executable file
View file

@ -0,0 +1,20 @@
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
#for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v

View file

@ -0,0 +1,11 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
#include "ggml-sycl.h"
int main(int argc, char ** argv) {
ggml_backend_sycl_print_sycl_devices();
return 0;
}

19
examples/sycl/run-llama2.sh Executable file
View file

@ -0,0 +1,19 @@
#!/bin/bash
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
source /opt/intel/oneapi/setvars.sh
if [ $# -gt 0 ]; then
export GGML_SYCL_DEVICE=$1
else
export GGML_SYCL_DEVICE=0
fi
echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
#export GGML_SYCL_DEBUG=1
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0

6
flake.lock generated
View file

@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1705133751, "lastModified": 1706191920,
"narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=", "narHash": "sha256-eLihrZAPZX0R6RyM5fYAWeKVNuQPYjAkCUBr+JNvtdE=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d", "rev": "ae5c332cbb5827f6b1f02572496b141021de335f",
"type": "github" "type": "github"
}, },
"original": { "original": {

View file

@ -1,3 +1,17 @@
# The flake interface to llama.cpp's Nix expressions. The flake is used as a
# more discoverable entry-point, as well as a way to pin the dependencies and
# expose default outputs, including the outputs built by the CI.
# For more serious applications involving some kind of customization you may
# want to consider consuming the overlay, or instantiating `llamaPackages`
# directly:
#
# ```nix
# pkgs.callPackage ${llama-cpp-root}/.devops/nix/scope.nix { }`
# ```
# Cf. https://jade.fyi/blog/flakes-arent-real/ for a more detailed exposition
# of the relation between Nix and the Nix Flakes.
{ {
description = "Port of Facebook's LLaMA model in C/C++"; description = "Port of Facebook's LLaMA model in C/C++";

View file

@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
if (block->size >= size) { if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1; best_fit_block = alloc->n_free_blocks - 1;
} else { } else {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
__func__, size, max_avail); __func__, tensor->name, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer"); GGML_ASSERT(!"not enough space in the buffer");
return; return;
} }
@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
} }
size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) { size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
return alloc->max_size; // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
// to avoid this, we add a 10% margin to the buffer size
return alloc->max_size + alloc->max_size/10;
} }
// graph allocator // graph allocator
@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
} }
// utils // utils
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
size_t alignment = ggml_backend_buft_get_alignment(buft); static bool alloc_tensor_range(struct ggml_context * ctx,
struct ggml_tensor * first, struct ggml_tensor * last,
size_t nbytes = 0; ggml_backend_buffer_type_t buft, size_t size,
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
if (t->data == NULL && t->view_src == NULL) { ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
}
}
if (nbytes == 0) {
// all the tensors in the context are already allocated
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
if (buffer == NULL) { if (buffer == NULL) {
// failed to allocate buffer
#ifndef NDEBUG #ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate buffer\n", __func__); fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif #endif
return NULL; for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free((*buffers)[i]);
}
free(*buffers);
return false;
} }
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer); ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) { if (t->data == NULL) {
if (t->view_src == NULL) { if (t->view_src == NULL) {
ggml_tallocr_alloc(tallocr, t); ggml_tallocr_alloc(tallocr, t);
@ -824,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
ggml_tallocr_free(tallocr); ggml_tallocr_free(tallocr);
*buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
(*buffers)[(*n_buffers)++] = buffer;
return true;
}
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
size_t alignment = ggml_backend_buft_get_alignment(buft);
size_t max_size = ggml_backend_buft_get_max_size(buft);
ggml_backend_buffer_t * buffers = NULL;
size_t n_buffers = 0;
size_t cur_buf_size = 0;
struct ggml_tensor * first = ggml_get_first_tensor(ctx);
for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
size_t this_size = 0;
if (t->data == NULL && t->view_src == NULL) {
this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
}
if (this_size > max_size) {
// tensor is too large to fit in a single buffer
fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
__func__, t->name,
ggml_backend_buft_name(buft),
this_size, max_size);
for (size_t i = 0; i < n_buffers; i++) {
ggml_backend_buffer_free(buffers[i]);
}
free(buffers);
return NULL;
}
if ((cur_buf_size + this_size) > max_size) {
// allocate tensors in the current buffer
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
first = t;
cur_buf_size = this_size;
} else {
cur_buf_size += this_size;
}
}
// allocate remaining tensors
if (cur_buf_size > 0) {
if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
}
if (n_buffers == 0) {
// all the tensors in the context are already allocated
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}
ggml_backend_buffer_t buffer;
if (n_buffers == 1) {
buffer = buffers[0];
} else {
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
}
free(buffers);
return buffer; return buffer;
} }
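For context, a hedged sketch (not part of the diff) of how this entry point is typically used. With the CPU buffer type the max size defaults to SIZE_MAX, so a single buffer comes back; backends that report a maximum allocation size may instead return a multi-buffer that spans several allocations. Tensor shapes here are arbitrary.

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <cstdio>

int main() {
    // Create a no_alloc context: only tensor metadata lives here, data is allocated later.
    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead() * 8,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);
    ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

    // Allocates one buffer, or a multi-buffer if the total exceeds the buffer type's max size.
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    printf("allocated %zu bytes (max per buffer: %zu)\n",
           ggml_backend_buffer_get_size(buf), ggml_backend_buft_get_max_size(buft));

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    return 0;
}
```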

View file

@ -19,6 +19,7 @@ extern "C" {
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft); const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory // check if tensor data is in host memory
@ -63,6 +64,11 @@ extern "C" {
// do not use directly, use ggml_backend_tensor_copy instead // do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// buffer that contains a collection of buffers
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
// //
// Backend // Backend
// //

View file

@ -27,10 +27,20 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
return buft->iface.get_alignment(buft); return buft->iface.get_alignment(buft);
} }
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
// get_max_size is optional, defaults to SIZE_MAX
if (buft->iface.get_max_size) {
return buft->iface.get_max_size(buft);
}
return SIZE_MAX;
}
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes // get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) { if (buft->iface.get_alloc_size) {
return buft->iface.get_alloc_size(buft, tensor); size_t size = buft->iface.get_alloc_size(buft, tensor);
assert(size >= ggml_nbytes(tensor));
return size;
} }
return ggml_nbytes(tensor); return ggml_nbytes(tensor);
} }
@ -55,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
size_t size) { size_t size) {
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
GGML_ASSERT(iface.get_base != NULL);
(*buffer) = (struct ggml_backend_buffer) { (*buffer) = (struct ggml_backend_buffer) {
/* .interface = */ iface, /* .interface = */ iface,
/* .buft = */ buft, /* .buft = */ buft,
@ -106,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer)); return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
} }
size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
}
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
} }
@ -120,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
buffer->usage = usage; buffer->usage = usage;
// FIXME: add a generic callback to the buffer interface
if (ggml_backend_buffer_is_multi_buffer(buffer)) {
ggml_backend_multi_buffer_set_usage(buffer, usage);
}
} }
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) { ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
@ -169,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend)); return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
} }
size_t ggml_backend_get_max_size(ggml_backend_t backend) {
return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
}
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@ -337,11 +358,21 @@ GGML_CALL static void ggml_backend_registry_init(void) {
ggml_backend_cuda_reg_devices(); ggml_backend_cuda_reg_devices();
#endif #endif
#ifdef GGML_USE_SYCL
extern void ggml_backend_sycl_reg_devices(void);
ggml_backend_sycl_reg_devices();
#endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL); ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
#endif #endif
#ifdef GGML_USE_VULKAN
extern GGML_CALL int ggml_backend_vk_reg_devices(void);
ggml_backend_vk_reg_devices();
#endif
} }
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
@ -545,6 +576,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name, /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@ -600,6 +632,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@ -756,6 +789,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
GGML_UNUSED(user_data); GGML_UNUSED(user_data);
} }
// multi-buffer buffer
struct ggml_backend_multi_buffer_context {
ggml_backend_buffer_t * buffers;
size_t n_buffers;
};
typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
}
GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_free(ctx->buffers[i]);
}
free(ctx->buffers);
free(ctx);
}
GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_clear(ctx->buffers[i], value);
}
}
static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
static struct ggml_backend_buffer_i multi_backend_buffer_i = {
/* .get_name = */ ggml_backend_multi_buffer_get_name,
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
/* .get_base = */ NULL,
/* .init_tensor = */ NULL,
/* .set_tensor = */ NULL,
/* .get_tensor = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_multi_buffer_clear,
/* .reset = */ NULL,
};
return multi_backend_buffer_i;
}
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
ctx->n_buffers = n_buffers;
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
size_t total_size = 0;
for (size_t i = 0; i < n_buffers; i++) {
ctx->buffers[i] = buffers[i];
total_size += ggml_backend_buffer_get_size(buffers[i]);
}
return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
}
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
}
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
}
}
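A hedged usage sketch of the new multi-buffer API (buffer sizes are arbitrary, and the CPU buffer type is used only for illustration):

```cpp
ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
ggml_backend_buffer_t parts[2] = {
    ggml_backend_buft_alloc_buffer(buft, 16 * 1024 * 1024),
    ggml_backend_buft_alloc_buffer(buft, 16 * 1024 * 1024),
};
ggml_backend_buffer_t multi = ggml_backend_multi_buffer_alloc_buffer(parts, 2);

// The multi-buffer reports the combined size and forwards clear/free to its parts.
size_t total = ggml_backend_buffer_get_size(multi);   // ~32 MiB
ggml_backend_buffer_free(multi);                       // also frees both underlying buffers
```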
// scheduler // scheduler
@ -1191,6 +1298,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
ggml_tallocr_t src_allocr = node_allocr(src); ggml_tallocr_t src_allocr = node_allocr(src);
GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
if (src_allocr != node_allocr) { if (src_allocr != node_allocr) {
// create a copy of the input in the split's backend
size_t id = hash_id(src);
if (sched->node_copies[id][cur_backend_id] == NULL) {
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
sched->node_copies[id][cur_backend_id] = tensor_copy;
node_allocr(tensor_copy) = cur_allocr;
SET_CAUSE(tensor_copy, "4.cpy");
int n_inputs = sched->splits[cur_split].n_inputs++;
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = src;
}
node->src[j] = sched->node_copies[id][cur_backend_id];
#if 0
// check if the input is already in the split // check if the input is already in the split
bool found = false; bool found = false;
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) { for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@ -1206,19 +1331,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = src; sched->splits[cur_split].inputs[n_inputs] = src;
} }
#endif
// create a copy of the input in the split's backend
size_t id = hash_id(src);
if (sched->node_copies[id][cur_backend_id] == NULL) {
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
sched->node_copies[id][cur_backend_id] = tensor_copy;
node_allocr(tensor_copy) = cur_allocr;
SET_CAUSE(tensor_copy, "4.cpy");
}
node->src[j] = sched->node_copies[id][cur_backend_id];
} }
} }
} }

View file

@ -20,6 +20,7 @@ extern "C" {
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
@ -36,6 +37,7 @@ extern "C" {
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
@ -54,6 +56,7 @@ extern "C" {
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend); GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size); GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend); GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
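The new *_get_max_size queries report the maximum buffer size a buffer type supports, falling back to SIZE_MAX when the backend leaves the hook NULL (see the interface tables further below). A sketch of how an allocator might use it to decide when a set of tensors has to be split across several buffers; tensors/n_tensors are assumed inputs and the chunking policy is illustrative only:

    size_t max_size = ggml_backend_buft_get_max_size(buft); // SIZE_MAX if the backend has no limit
    size_t cur_size = 0;
    int    n_chunks = 1;

    for (int i = 0; i < n_tensors; i++) {
        const size_t ts = ggml_backend_buft_get_alloc_size(buft, tensors[i]);
        if (cur_size + ts > max_size) {
            n_chunks += 1; // this tensor starts a new buffer
            cur_size   = 0;
        }
        cur_size += ts;
    }
    // each chunk would then be allocated with ggml_backend_buft_alloc_buffer(buft, <chunk size>)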

View file

@ -13,6 +13,10 @@
#include <map> #include <map>
#include <array> #include <array>
// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
#if defined(GGML_USE_HIPBLAS) #if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#include <hipblas/hipblas.h> #include <hipblas/hipblas.h>
@ -585,13 +589,28 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0,
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
[[noreturn]] [[noreturn]]
static __device__ void bad_arch() { static __device__ void no_device_code(
printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n"); const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
file_name, line, function_name, arch);
(void) arch_list;
#else
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
__trap(); __trap();
(void) bad_arch; // suppress unused function warning (void) no_device_code; // suppress unused function warning
} }
#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__
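For reference, the stringize pair is the standard two-level preprocessor idiom: the outer macro expands its argument first, the inner one then applies #. A standalone sketch in plain C; the arch list value is an assumed example of what nvcc provides via __CUDA_ARCH_LIST__:

    #include <stdio.h>

    #define STRINGIZE_IMPL(...) #__VA_ARGS__
    #define STRINGIZE(...)      STRINGIZE_IMPL(__VA_ARGS__)

    #define ARCH_LIST 610,700,860   // stand-in for __CUDA_ARCH_LIST__

    int main(void) {
        // prints "compiled for: 610,700,860" -- the extra STRINGIZE_IMPL level is needed
        // so that ARCH_LIST is macro-expanded before # turns it into a string literal
        printf("compiled for: %s\n", STRINGIZE(ARCH_LIST));
        return 0;
    }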
static __device__ __forceinline__ float warp_reduce_sum(float x) { static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll #pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) { for (int mask = 16; mask > 0; mask >>= 1) {
@ -618,7 +637,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
return a; return a;
#else #else
(void) a; (void) a;
bad_arch(); NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
} }
@ -647,7 +666,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
return x; return x;
#else #else
(void) x; (void) x;
bad_arch(); NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
} }
@ -2444,7 +2463,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
} }
#else #else
(void) vx; (void) y; (void) k; (void) vx; (void) y; (void) k;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_PASCAL #endif // __CUDA_ARCH__ >= CC_PASCAL
} }
@ -2475,7 +2494,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
// second part effectively subtracts 8 from each quant value // second part effectively subtracts 8 from each quant value
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2512,7 +2531,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2547,7 +2566,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
// second part effectively subtracts 16 from each quant value // second part effectively subtracts 16 from each quant value
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2592,7 +2611,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
return sumi*d5d8 + m5s8 / (QI5_1 / vdr); return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2613,7 +2632,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
return d8_0*d8_1 * sumi; return d8_0*d8_1 * sumi;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2643,7 +2662,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
return sumi*d8d8 + m8s8 / (QI8_1 / vdr); return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2678,7 +2697,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
return dm2f.x*sumf_d - dm2f.y*sumf_m; return dm2f.x*sumf_d - dm2f.y*sumf_m;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2715,7 +2734,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2755,7 +2774,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
return d3 * sumf; return d3 * sumf;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2780,7 +2799,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
return d3*d8 * sumi; return d3*d8 * sumi;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2813,7 +2832,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
return dm4f.x*sumf_d - dm4f.y*sumf_m; return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2846,7 +2865,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
return dm4f.x*sumf_d - dm4f.y*sumf_m; return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2886,7 +2905,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
return dm5f.x*sumf_d - dm5f.y*sumf_m; return dm5f.x*sumf_d - dm5f.y*sumf_m;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2919,7 +2938,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
return dm4f.x*sumf_d - dm4f.y*sumf_m; return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2949,7 +2968,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
return d*sumf; return d*sumf;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2980,7 +2999,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
return d6 * sumf_d; return d6 * sumf_d;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -3846,7 +3865,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
return dall * sumf_d - dmin * sumf_m; return dall * sumf_d - dmin * sumf_m;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif #endif
@ -4029,7 +4048,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
return d * sumf_d; return d * sumf_d;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif #endif
@ -4287,7 +4306,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
q8 += 8; q8 += 8;
aux32 >>= 7; aux32 >>= 7;
} }
const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f; const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
return d * sumi; return d * sumi;
#else #else
// iqs is 0...15 // iqs is 0...15
@ -4298,7 +4317,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]); const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]); const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
const uint32_t aux32 = q2[2] | (q2[3] << 16); const uint32_t aux32 = q2[2] | (q2[3] << 16);
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f; const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127]; const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127]; const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
const int8_t * q8 = bq8_1[ib32].qs + 16*il; const int8_t * q8 = bq8_1[ib32].qs + 16*il;
@ -4343,7 +4362,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
} }
q8 += 8; q8 += 8;
} }
const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f; const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
#else #else
assert(false); assert(false);
@ -4524,7 +4543,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q4_0_q8_1_mul_mat; (void) vec_dot_q4_0_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4593,7 +4612,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q4_1_q8_1_mul_mat; (void) vec_dot_q4_1_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4660,7 +4679,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q5_0_q8_1_mul_mat; (void) vec_dot_q5_0_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4727,7 +4746,7 @@ mul_mat_q5_1(
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q5_1_q8_1_mul_mat; (void) vec_dot_q5_1_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4794,7 +4813,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q8_0_q8_1_mul_mat; (void) vec_dot_q8_0_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4861,7 +4880,7 @@ mul_mat_q2_K(
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q2_K_q8_1_mul_mat; (void) vec_dot_q2_K_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4930,7 +4949,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q3_K_q8_1_mul_mat; (void) vec_dot_q3_K_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4999,7 +5018,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q4_K_q8_1_mul_mat; (void) vec_dot_q4_K_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -5066,7 +5085,7 @@ mul_mat_q5_K(
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q5_K_q8_1_mul_mat; (void) vec_dot_q5_K_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -5135,7 +5154,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q6_K_q8_1_mul_mat; (void) vec_dot_q6_K_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -5858,7 +5877,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
} }
#else #else
(void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale; (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
bad_arch(); NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
} }
@ -10225,8 +10244,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
// TODO: mmq/mmv support // TODO: mmq/mmv support
#endif #endif
const int64_t nb11 = src1->nb[1]; const size_t nb11 = src1->nb[1];
const int64_t nb1 = dst->nb[1]; const size_t nb1 = dst->nb[1];
const struct ggml_tensor * ids = src0; const struct ggml_tensor * ids = src0;
const int32_t id = ((int32_t *) dst->op_params)[0]; const int32_t id = ((int32_t *) dst->op_params)[0];
@ -10887,15 +10906,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
if (ggml_is_quantized(tensor->type)) { if (ggml_is_quantized(tensor->type)) {
// initialize padding to 0 to avoid possible NaN values // initialize padding to 0 to avoid possible NaN values
int64_t row_low = 0; size_t original_size = ggml_nbytes(tensor);
int64_t row_high = ggml_nrows(tensor);
int64_t nrows_split = row_high - row_low;
size_t original_size = ggml_nbytes_split(tensor, nrows_split);
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
if (padded_size > original_size && tensor->view_src == nullptr) { if (padded_size > original_size && tensor->view_src == nullptr) {
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0])); CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
} }
} }
} }
@ -10998,12 +11013,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
} }
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
int64_t row_low = 0; size_t size = ggml_nbytes(tensor);
int64_t row_high = ggml_nrows(tensor);
int64_t nrows_split = row_high - row_low;
size_t size = ggml_nbytes_split(tensor, nrows_split);
int64_t ne0 = tensor->ne[0]; int64_t ne0 = tensor->ne[0];
if (ggml_is_quantized(tensor->type)) { if (ggml_is_quantized(tensor->type)) {
@ -11032,6 +11042,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
/* .get_name = */ ggml_backend_cuda_buffer_type_name, /* .get_name = */ ggml_backend_cuda_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
/* .is_host = */ NULL, /* .is_host = */ NULL,
@ -11307,6 +11318,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
/* .get_name = */ ggml_backend_cuda_split_buffer_type_name, /* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend, /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
@ -11386,6 +11398,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name, /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,

View file

@ -24,19 +24,7 @@
#define UNUSED(x) (void)(x) #define UNUSED(x) (void)(x)
#define GGML_METAL_MAX_KERNELS 256
struct ggml_metal_buffer {
const char * name;
void * data;
size_t size;
id<MTLBuffer> metal;
};
struct ggml_metal_kernel { struct ggml_metal_kernel {
id<MTLFunction> function;
id<MTLComputePipelineState> pipeline; id<MTLComputePipelineState> pipeline;
}; };
@ -149,7 +137,10 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
@ -171,14 +162,10 @@ struct ggml_metal_context {
id<MTLDevice> device; id<MTLDevice> device;
id<MTLCommandQueue> queue; id<MTLCommandQueue> queue;
id<MTLLibrary> library;
dispatch_queue_t d_queue; dispatch_queue_t d_queue;
int n_buffers; struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
bool support_simdgroup_reduction; bool support_simdgroup_reduction;
bool support_simdgroup_mm; bool support_simdgroup_mm;
@ -245,26 +232,24 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
// Show all the Metal device instances in the system // Show all the Metal device instances in the system
NSArray * devices = MTLCopyAllDevices(); NSArray * devices = MTLCopyAllDevices();
for (id<MTLDevice> device in devices) { for (id<MTLDevice> device in devices) {
NSString * s = [device name]; GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
} }
[devices release]; // since it was created by a *Copy* C method [devices release]; // since it was created by a *Copy* C method
#endif #endif
// Pick and show default Metal device // Pick and show default Metal device
id<MTLDevice> device = MTLCreateSystemDefaultDevice(); id<MTLDevice> device = MTLCreateSystemDefaultDevice();
NSString * s = [device name]; GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
// Configure context // Configure context
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
ctx->device = device; ctx->device = device;
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
ctx->queue = [ctx->device newCommandQueue]; ctx->queue = [ctx->device newCommandQueue];
ctx->n_buffers = 0;
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
id<MTLLibrary> metal_library;
// load library // load library
{ {
NSBundle * bundle = nil; NSBundle * bundle = nil;
@ -279,7 +264,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
// pre-compiled library found // pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:libPath]; NSURL * libURL = [NSURL fileURLWithPath:libPath];
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]); GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
if (error) { if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
@ -321,7 +306,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
//[options setFastMathEnabled:false]; //[options setFastMathEnabled:false];
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
if (error) { if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
@ -386,8 +371,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
{ {
NSError * error = nil; NSError * error = nil;
for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) { for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
ctx->kernels[i].function = nil;
ctx->kernels[i].pipeline = nil; ctx->kernels[i].pipeline = nil;
} }
@ -399,13 +383,15 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
#define GGML_METAL_ADD_KERNEL(e, name, supported) \ #define GGML_METAL_ADD_KERNEL(e, name, supported) \
if (supported) { \ if (supported) { \
struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \ struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \ id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \ kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
[metal_function release]; \
GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
(int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \ (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
(int) kernel->pipeline.threadExecutionWidth); \ (int) kernel->pipeline.threadExecutionWidth); \
if (error) { \ if (error) { \
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
[metal_library release]; \
return NULL; \ return NULL; \
} \ } \
} else { \ } else { \
@ -522,7 +508,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
@ -537,27 +526,17 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
} }
[metal_library release];
return ctx; return ctx;
} }
static void ggml_metal_free(struct ggml_metal_context * ctx) { static void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
for (int i = 0; i < ctx->n_buffers; ++i) { for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
[ctx->buffers[i].metal release];
}
for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
if (ctx->kernels[i].pipeline) {
[ctx->kernels[i].pipeline release]; [ctx->kernels[i].pipeline release];
} }
if (ctx->kernels[i].function) {
[ctx->kernels[i].function release];
}
}
[ctx->library release];
[ctx->queue release]; [ctx->queue release];
[ctx->device release]; [ctx->device release];
@ -589,15 +568,13 @@ struct ggml_backend_metal_buffer_context {
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
// Metal buffer based on the host memory pointer // Metal buffer based on the host memory pointer
// //
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
//GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
const int64_t tsize = ggml_nbytes(t); const int64_t tsize = ggml_nbytes(t);
ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
// compatibility with ggml-backend
if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context; struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
// find the view that contains the tensor fully // find the view that contains the tensor fully
@ -619,25 +596,6 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
return nil; return nil;
} }
// find the view that contains the tensor fully
for (int i = 0; i < ctx->n_buffers; ++i) {
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
//GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
*offs = (size_t) ioffs;
//GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
return ctx->buffers[i].metal;
}
}
GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);
return nil;
}
static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) { static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
switch (op->op) { switch (op->op) {
case GGML_OP_UNARY: case GGML_OP_UNARY:
@ -681,7 +639,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
return true; return true;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
return ctx->support_simdgroup_reduction; return ctx->support_simdgroup_reduction &&
(op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32);
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_CONT: case GGML_OP_CONT:
@ -826,9 +785,9 @@ static bool ggml_metal_graph_compute(
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil; id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil;
//GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
//if (src0) { //if (src0) {
@ -1610,7 +1569,7 @@ static bool ggml_metal_graph_compute(
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
size_t offs_src_cur = 0; size_t offs_src_cur = 0;
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j]; [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
} }
@ -1755,7 +1714,7 @@ static bool ggml_metal_graph_compute(
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
size_t offs_src_cur = 0; size_t offs_src_cur = 0;
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j]; [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
} }
@ -2189,9 +2148,9 @@ static bool ggml_metal_graph_compute(
size_t offs_src3 = 0; size_t offs_src3 = 0;
GGML_ASSERT(src2); GGML_ASSERT(src2);
id<MTLBuffer> id_src2 = ggml_metal_get_buffer(ctx, src2, &offs_src2); id<MTLBuffer> id_src2 = ggml_metal_get_buffer(src2, &offs_src2);
id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(ctx, src3, &offs_src3) : nil; id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30); const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
const int64_t ne31 = src3 ? src3->ne[1] : 0; const int64_t ne31 = src3 ? src3->ne[1] : 0;
@ -2213,7 +2172,10 @@ static bool ggml_metal_graph_compute(
switch (ne00) { switch (ne00) {
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break; case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break;
case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break; case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break;
case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break; case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
default: default:
{ {
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
@ -2253,14 +2215,17 @@ static bool ggml_metal_graph_compute(
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26]; [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26];
[encoder setBytes:&scale length:sizeof( float) atIndex:27]; [encoder setBytes:&scale length:sizeof( float) atIndex:27];
// for small batches use more simdgroups (needs more tests, to confirm if it's worth it)
const int64_t nsg = ne01 < 4 ? 12 : 4; // simdgroups per threadgroup (a.k.a. warps)
const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !! const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !!
const int64_t ncpsg = 32; // cache values per simdgroup (does not work for other values) const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
//const size_t smem = nqptg*(nhptg*ne00 + nsg*(nhptg*ne00 + 256))*(sizeof(float)/2); GGML_ASSERT(nqptg % 8 == 0);
const size_t smem = nqptg*(ne00 + nsg*(ne00 + 1*ncpsg))*(sizeof(float)/2); GGML_ASSERT(ncpsg % 32 == 0);
// simdgroups per threadgroup (a.k.a. warps)
// for small batches use more simdgroups (needs more tests, to confirm if it's worth it)
const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)) : 4;
const size_t smem = nqptg*(ne00 + nsg*(ncpsg + nqptg))*(sizeof(float)/2);
//printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength); //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength);
GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength); GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
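To make the sizing concrete, a worked example with assumed shapes (head size ne00 = 128, nqptg = 8, ncpsg = 32, and sizeof(float)/2 = 2 bytes per half):

    nsg = 4 (regular batch):        smem = 8*(128 + 4*(32 + 8))*2 = 4608 bytes
    nsg = 8 (small batch, long KV): smem = 8*(128 + 8*(32 + 8))*2 = 7168 bytes

Both fit comfortably within the threadgroup memory limit checked by the assert above (32 KiB on recent Apple GPUs).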
@ -2465,10 +2430,13 @@ GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backe
UNUSED(buft); UNUSED(buft);
} }
static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device) { static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
#ifndef GGML_METAL_NDEBUG
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
if (@available(macOS 10.12, iOS 16.0, *)) { if (@available(macOS 10.12, iOS 16.0, *)) {
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
__func__,
size_aligned / 1024.0 / 1024.0,
device.currentAllocatedSize / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0,
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@ -2478,10 +2446,15 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device) {
GGML_METAL_LOG_INFO("\n"); GGML_METAL_LOG_INFO("\n");
} }
} else { } else {
GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
__func__,
size_aligned / 1024.0 / 1024.0,
device.currentAllocatedSize / 1024.0 / 1024.0);
} }
#endif
#endif #endif
UNUSED(device); UNUSED(device);
UNUSED(size_aligned);
} }
GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@ -2515,8 +2488,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
return NULL; return NULL;
} }
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); ggml_backend_metal_log_allocated_size(device, size_aligned);
ggml_backend_metal_log_allocated_size(device);
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
} }
@ -2544,6 +2516,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
/* .get_name = */ ggml_backend_metal_buffer_type_get_name, /* .get_name = */ ggml_backend_metal_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend, /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_metal_buffer_type_is_host, /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
@ -2592,7 +2565,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
return false; return false;
} }
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); ggml_backend_metal_log_allocated_size(device, size_aligned);
++ctx->n_buffers; ++ctx->n_buffers;
} else { } else {
@ -2615,7 +2588,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
return false; return false;
} }
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i); ggml_backend_metal_log_allocated_size(device, size_step_aligned);
if (i + size_step < size) { if (i + size_step < size) {
GGML_METAL_LOG_INFO("\n"); GGML_METAL_LOG_INFO("\n");
} }
@ -2624,8 +2598,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
} }
} }
ggml_backend_metal_log_allocated_size(device);
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
} }

View file

@ -1995,6 +1995,7 @@ typedef void (flash_attn_ext_f16_t)(
uint tiisg[[thread_index_in_simdgroup]], uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]); uint sgitg[[simdgroup_index_in_threadgroup]]);
// ref: https://arxiv.org/pdf/2307.08691.pdf
template<int64_t D, int64_t Q, int64_t C> // head size, queries per threadgroup, cache items per threadgroup template<int64_t D, int64_t Q, int64_t C> // head size, queries per threadgroup, cache items per threadgroup
kernel void kernel_flash_attn_ext_f16( kernel void kernel_flash_attn_ext_f16(
device const char * q, device const char * q,
@ -2038,39 +2039,45 @@ kernel void kernel_flash_attn_ext_f16(
const int64_t iq1 = tgpig[0]*Q; const int64_t iq1 = tgpig[0]*Q;
const int64_t D4 = D/4; const int64_t D4 = D/4;
const int64_t N4 = N_SIMDWIDTH;
const int64_t L4 = (D4 + N4 - 1)/N4;
const int64_t D8 = D/8; const int64_t D8 = D/8;
const int64_t Q8 = Q/8;
const int64_t NW = N_SIMDWIDTH;
const int64_t SH = (C + Q); // shared memory per simdgroup in (half)
const int64_t T = D + nsg*(D + 1*C); // shared memory size per query in half const int64_t T = D + nsg*SH; // shared memory size per query in (half)
const int64_t T4 = T/4; // shared memory size per query in half4 const int64_t T4 = T/4; // shared memory size per query in (half4)
threadgroup half * pq = (threadgroup half *) (shared + 0*D); threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data
threadgroup half4 * pq4 = (threadgroup half4 *) (shared + 0*D); threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4
threadgroup half * ps = (threadgroup half *) (shared + sgitg*(D + 1*C) + 1*D); threadgroup half * ss = (threadgroup half *) (shared + sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
threadgroup half4 * ps4 = (threadgroup half4 *) (shared + sgitg*(D + 1*C) + 1*D);
threadgroup half * ss = (threadgroup half *) (shared + sgitg*(D + 1*C) + 2*D); // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
simdgroup_half8x8 lo[Q8][D8];
for (int64_t i = 0; i < L4; ++i) {
// load heads from Q to shared memory // load heads from Q to shared memory
for (int64_t j = sgitg; j < Q; j += nsg) { for (int64_t j = sgitg; j < Q; j += nsg) {
device const float4 * q4 = (device const float4 *) ((device const char *) q + ((iq1 + j)*nb01 + iq2*nb02 + iq3*nb03)); device const float4 * q4 = (device const float4 *) ((device const char *) q + ((iq1 + j)*nb01 + iq2*nb02 + iq3*nb03));
for (int64_t i = tiisg; i < D4; i += NW) {
if (iq1 + j < ne01) { if (iq1 + j < ne01) {
pq4[j*T4 + N4*i + tiisg] = (half4) q4[N4*i + tiisg]; sq4[j*T4 + i] = (half4) q4[i];
} else { } else {
pq4[j*T4 + N4*i + tiisg] = 0.0h; sq4[j*T4 + i] = 0.0h;
}
} }
} }
// zero out shared memory // zero out lo
for (int64_t j = 0; j < Q; ++j) { for (int64_t j = 0; j < Q8; ++j) {
ps4[j*T4 + N4*i + tiisg] = 0.0h; for (int64_t i = 0; i < D8; ++i) {
lo[j][i] = make_filled_simdgroup_matrix<half, 8>(0.0h);
} }
} }
if (tiisg < C) { // zero out shared memory SH
for (int64_t j = 0; j < Q; ++j) { for (int64_t j = 0; j < Q; ++j) {
ss[j*T + 0 + tiisg] = 0.0h; for (int64_t i = tiisg; i < SH; i += NW) {
ss[j*T + i] = 0.0h;
} }
} }
@ -2103,79 +2110,68 @@ kernel void kernel_flash_attn_ext_f16(
const int64_t iv2 = iq2 / rv2; const int64_t iv2 = iq2 / rv2;
const int64_t iv3 = iq3 / rv3; const int64_t iv3 = iq3 / rv3;
simdgroup_half8x8 mq[D8]; // load the queries from shared memory into local memory
simdgroup_half8x8 mq[Q8][D8];
for (int64_t j = 0; j < Q8; ++j) {
for (int64_t i = 0; i < D8; ++i) { for (int64_t i = 0; i < D8; ++i) {
simdgroup_load(mq[i], pq + i*8, T); simdgroup_load(mq[j][i], sq + 8*j*T + i*8, T);
}
} }
// TODO: this can be improved
device const float * mp[Q];
{
const int64_t ir = iq3*ne02*ne01 + iq2*ne01 + iq1; const int64_t ir = iq3*ne02*ne01 + iq2*ne01 + iq1;
for (int64_t j = 0; j < Q; ++j) { // pointer to the mask
if (iq1 + j < ne01) { device const float * mp = (device const float *) (mask + (ir%ne31)*nb31);
mp[j] = (device const float *) (mask + ((ir + j)%ne31)*nb31);
} else {
mp[j] = nullptr;
}
}
}
// prepare diagonal scale matrix // prepare diagonal scale matrix
simdgroup_half8x8 mscale(scale); simdgroup_float8x8 mscale(scale);
for (int64_t iic = C*sgitg; iic < ne11; iic += C*nsg) {
// skip -INF blocks
// TODO: double-check this
{
float smc = -INFINITY;
for (int64_t j = 0; j < Q; ++j) {
const float mc = mp[j] ? mp[j][iic + tiisg] : -INFINITY;
smc = simd_max(max(smc, mc));
}
if (smc == -INFINITY) {
continue;
}
}
// loop over the KV cache
// each simdgroup handles blocks of Q rows and C columns
for (int64_t ic = C*sgitg; ic < ne11; ic += C*nsg) {
// Q*K^T // Q*K^T
{ {
simdgroup_half8x8 mk;
for (int cc = 0; cc < C/8; ++cc) { for (int cc = 0; cc < C/8; ++cc) {
simdgroup_half8x8 mqk = make_filled_simdgroup_matrix<half, Q>(0.h); simdgroup_half8x8 mqk[Q8];
for (int64_t j = 0; j < Q8; ++j) {
mqk[j] = make_filled_simdgroup_matrix<half, 8>(0.h);
}
device const half * pk = (device const half *) ((device const char *) k + ((iic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13)); device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13));
for (int64_t i = 0; i < D8; ++i) { for (int64_t i = 0; i < D8; ++i) {
simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); simdgroup_half8x8 mk;
simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose
simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk); for (int64_t j = 0; j < Q8; ++j) {
simdgroup_multiply_accumulate(mqk[j], mq[j][i], mk, mqk[j]);
}
} }
// mqk = mqk*scale + mask // mqk = mqk*scale + mask
for (int64_t j = 0; j < Q8; ++j) {
simdgroup_float8x8 mm; simdgroup_float8x8 mm;
simdgroup_load(mm, mp[0] + iic + 8*cc, nb31/sizeof(float), 0, false); simdgroup_load(mm, mp + 8*j*(nb31/sizeof(float)) + ic + 8*cc, nb31/sizeof(float), 0, false);
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm); simdgroup_multiply_accumulate(mqk[j], mqk[j], mscale, mm);
simdgroup_store(mqk, ss + 8*cc, T, 0, false); simdgroup_store(mqk[j], ss + 8*j*T + 8*cc, T, 0, false);
} }
} }
}
// used to detect blocks full of -INF
half smax = -INFINITY;
// online softmax // online softmax
if (C == 32) {
for (int64_t j = 0; j < Q; ++j) { for (int64_t j = 0; j < Q; ++j) {
const int64_t p = tiisg; const int64_t p = tiisg;
//const half s = ss[j*T + p]*scale + (mp[j][iic + p]); const half m = M[j];
const half s = ss[j*T + p]; const half s = ss[j*T + p];
half m = M[j]; smax = simd_max(max(smax, s));
M[j] = simd_max(max(M[j], s)); M[j] = simd_max(max(M[j], s));
const half ms = m == -INFINITY ? 0.0h : exp(m - M[j]); const half ms = m == -INFINITY ? 0.0h : exp(m - M[j]);
@ -2183,42 +2179,83 @@ kernel void kernel_flash_attn_ext_f16(
S[j] = S[j]*ms + simd_sum(vs); S[j] = S[j]*ms + simd_sum(vs);
for (int64_t i = 0; i < L4; ++i) { // create a QxQ diagonal matrix for rescaling the output
ps4[j*T4 + N4*i + tiisg] *= ms; if (p == j) {
ss[j*T + C + j] = ms;
} }
// the P matrix from the paper (Q rows, C columns)
ss[j*T + p] = vs; ss[j*T + p] = vs;
} }
} else {
for (int64_t j = 0; j < Q; ++j) {
const half m = M[j];
                for (int64_t p = tiisg; p < C; p += NW) {
                    const half s = ss[j*T + p];

                    smax = simd_max(max(smax, s));
                    M[j] = simd_max(max(M[j], s));
                }

                const half ms = m == -INFINITY ? 0.0h : exp(m - M[j]);

                S[j] = S[j]*ms;

                // create a QxQ diagonal matrix for rescaling the output
                if (tiisg == j) {
                    ss[j*T + C + j] = ms;
                }

                for (int64_t p = tiisg; p < C; p += NW) {
                    const half s = ss[j*T + p];

                    const half vs = s == -INFINITY ? 0.0h : exp(s - M[j]);

                    S[j] = S[j] + simd_sum(vs);

                    // the P matrix from the paper (Q rows, C columns)
                    ss[j*T + p] = vs;
                }
            }

            // skip -INF blocks
            if (smax == -INFINITY) {
                continue;
            }

            // O = diag(ms)*O
            for (int64_t j = 0; j < Q8; ++j) {
                simdgroup_half8x8 mm;
                simdgroup_load(mm, ss + 8*j*T + C + 8*j, T, 0, false);

                for (int64_t i = 0; i < D8; ++i) {
                    simdgroup_multiply(lo[j][i], mm, lo[j][i]);
                }
            }

            // O = O + (Q*K^T)*V
            {
                for (int cc = 0; cc < C/8; ++cc) {
                    device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23));

                    for (int64_t i = 0; i < D8; ++i) {
                        simdgroup_half8x8 mk;
                        simdgroup_load(mk, pv + i*8, nb21/sizeof(half), 0, false);

                        for (int64_t j = 0; j < Q8; ++j) {
                            simdgroup_half8x8 mv;
                            simdgroup_load(mv, ss + 8*j*T + 8*cc, T, 0, false);

                            simdgroup_multiply_accumulate(lo[j][i], mv, mk, lo[j][i]);
                        }
                    }
                }
            }
        }

        // these are needed for reducing the results from the simdgroups (reuse the ss buffer)
        for (int64_t j = 0; j < Q; ++j) {
            if (tiisg == 0) {
                ss[j*T + 0] = S[j];
@ -2227,91 +2264,86 @@ kernel void kernel_flash_attn_ext_f16(
            }
        }

        // reduce the warps sequentially
        for (int64_t sg = 1; sg < nsg; ++sg) {
            half S = { 0.0h };
            half M = { -INFINITY };

            threadgroup_barrier(mem_flags::mem_threadgroup);

            // each simdgroup stores its output to shared memory, reusing sq
            if (sgitg == sg) {
                for (int64_t j = 0; j < Q8; ++j) {
                    for (int64_t i = 0; i < D8; ++i) {
                        simdgroup_store(lo[j][i], sq + 8*j*T + i*8, T, 0, false);
                    }
                }
            }

            threadgroup_barrier(mem_flags::mem_threadgroup);

            // the first simdgroup accumulates the results from the other simdgroups
            if (sgitg == 0) {
                for (int64_t j = 0; j < Q; ++j) {
                    const half S0 = ss[j*T +         0];
                    const half S1 = ss[j*T + sg*SH + 0];

                    const half M0 = ss[j*T +         1];
                    const half M1 = ss[j*T + sg*SH + 1];

                    M = max(M0, M1);

                    const half ms0 = M0 == -INFINITY ? 0.0h : exp(M0 - M);
                    const half ms1 = M1 == -INFINITY ? 0.0h : exp(M1 - M);

                    S = S0*ms0 + S1*ms1;

                    if (tiisg == 0) {
                        ss[j*T + 0] = S;
                        ss[j*T + 1] = M;

                        ss[j*T + C + j        ] = ms0;
                        ss[j*T + C + j + sg*SH] = ms1;
                    }
                }

                // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
                for (int64_t j = 0; j < Q8; ++j) {
                    simdgroup_half8x8 t;
                    simdgroup_half8x8 ms0;
                    simdgroup_half8x8 ms1;

                    simdgroup_load(ms0, ss + 8*j*T + C + 8*j,         T, 0, false);
                    simdgroup_load(ms1, ss + 8*j*T + C + 8*j + sg*SH, T, 0, false);

                    for (int64_t i = 0; i < D8; ++i) {
                        simdgroup_load    (t, sq + 8*j*T + i*8, T, 0, false);
                        simdgroup_multiply(t, ms1, t);

                        simdgroup_multiply_accumulate(lo[j][i], ms0, lo[j][i], t);
                    }
                }
            }
        }

        // store result to shared memory (reuse sq)
        if (sgitg == 0) {
            for (int64_t j = 0; j < Q8; ++j) {
                for (int64_t i = 0; i < D8; ++i) {
                    simdgroup_store(lo[j][i], sq + 8*j*T + i*8, T, 0, false);
                }
            }
        }

        device float4 * dst4 = (device float4 *) dst;

        // final rescale with 1/S and store to global memory
        if (sgitg == 0) {
            for (int64_t j = 0; j < Q && iq1 + j < ne01; ++j) {
                const half S = ss[j*T + 0];

                for (int64_t i = tiisg; i < D4; i += NW) {
                    dst4[(iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D4 + i] = (float4) sq4[j*T4 + i]/S;
                }
            }
        }
@ -2319,7 +2351,10 @@ kernel void kernel_flash_attn_ext_f16(
template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<64, 8, 32>;
template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<80, 8, 32>;
template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96, 8, 32>;
template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112, 8, 32>;
template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128, 8, 32>;
template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256, 8, 32>;
kernel void kernel_cpy_f16_f16(
device const half * src0,

@ -714,7 +714,6 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
dst[row] = tmp[0];
}
}
);
@ -784,6 +783,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
dst[row] = tmp[0];
}
}
);
@ -799,6 +799,18 @@ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y
}
);
std::string add_template = MULTILINE_QUOTE(
__kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) {
const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
if (i >= get_global_size(0)) {
return;
}
dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky];
}
);
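The new add_f32 kernel broadcasts y over x through the `i % ky` index. As a reference only, this is a hedged CPU-side sketch of what one (i02, i03) plane of that computation does; the function name and the per-plane offsets handled by the host code below are the editor's assumptions, not part of this diff:

    // Reference sketch: one plane of the add_f32 broadcast, ky = ne10*ne11.
    static void add_f32_plane_ref(const float * x, const float * y, float * dst,
                                  const int64_t ne00, const int64_t ne01, const int64_t ky) {
        for (int64_t i = 0; i < ne00*ne01; ++i) {
            dst[i] = x[i] + y[i % ky];  // same broadcast rule as the OpenCL kernel
        }
    }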
#define CL_CHECK(err) \
do { \
cl_int err_ = (err); \
@ -878,6 +890,7 @@ static std::string generate_kernels() {
}
src << mul_kernel << '\n';
}
src << add_template << '\n';
return src.str();
}
@ -893,6 +906,7 @@ static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl,
static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
static cl_kernel mul_f32_cl;
static cl_kernel add_f32_cl;
static bool fp16_support;
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@ -1100,9 +1114,10 @@ void ggml_cl_init(void) {
char *ext_buffer = (char *)alloca(ext_str_size + 1);
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
// Disabled due to faulty outputs
// Check if ext_buffer contains cl_khr_fp16
fp16_support = false; // strstr(ext_buffer, "cl_khr_fp16") != NULL;
// fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
cl_context_properties properties[] = {
(intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
@ -1150,6 +1165,8 @@ void ggml_cl_init(void) {
// mul kernel
CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err));
}
static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@ -1458,6 +1475,70 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
ggml_cl_mul_f32(src0, src1, dst);
}
static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
size_t x_size;
size_t d_size;
cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
cl_event ev;
// copy src0 to device
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
const int64_t i13 = i03%ne13;
const int64_t i12 = i02%ne12;
const int i1 = i13*ne12*ne11 + i12*ne11;
cl_int x_offset = 0;
cl_int y_offset = i1*ne10;
cl_int d_offset = 0;
size_t global = ne00 * ne01;
cl_int ky = ne10 * ne11;
CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X));
CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset));
CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y));
CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset));
CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D));
CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset));
CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky));
CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
CL_CHECK(clReleaseEvent(ev));
CL_CHECK(clFinish(queue));
// copy dst to host
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
}
}
ggml_cl_pool_free(d_X, x_size);
ggml_cl_pool_free(d_D, d_size);
}
void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
ggml_cl_add_f32(src0, src1, dst);
}
static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
@ -2055,6 +2136,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // TODO: return from device info
/* .get_alloc_size = */ NULL,
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
/* .is_host = */ NULL,
@ -2111,6 +2193,7 @@ ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
/* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,

@ -10,6 +10,7 @@ extern "C" {
GGML_API void ggml_cl_init(void);
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

ggml-sycl.cpp: new file, 15199 lines (diff suppressed because it is too large)

ggml-sycl.h: new file, 27 lines
@ -0,0 +1,27 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_SYCL_MAX_DEVICES 16
#define GGML_SYCL_NAME "SYCL"
GGML_API void ggml_init_sycl(void);
GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
#ifdef __cplusplus
}
#endif
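A minimal usage sketch (not part of this diff) of the SYCL backend entry points declared above, combined with the generic ggml-backend helpers; the device index 0 is an arbitrary choice:

    #include "ggml-backend.h"
    #include "ggml-sycl.h"
    #include <stdio.h>

    int main(void) {
        ggml_backend_sycl_print_sycl_devices();              // list the SYCL devices ggml can see
        ggml_backend_t backend = ggml_backend_sycl_init(0);  // initialize device 0
        if (backend == NULL) {
            fprintf(stderr, "failed to initialize the SYCL backend\n");
            return 1;
        }
        // ... allocate tensors via ggml_backend_sycl_buffer_type(0) and run graphs here ...
        ggml_backend_free(backend);
        return 0;
    }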

ggml-vulkan-shaders.hpp: new file, 61420 lines (diff suppressed because it is too large)

ggml-vulkan.cpp: new file, 5176 lines (diff suppressed because it is too large)

ggml-vulkan.h: new file, 34 lines
@ -0,0 +1,34 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_VK_NAME "Vulkan"
GGML_API void ggml_vk_init(void);
GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
GGML_API void ggml_vk_preallocate_buffers(void);
GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node);
GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS
void ggml_vk_check_results_1(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#endif
GGML_API void ggml_vk_graph_cleanup(void);
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(void);
GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(void);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
#ifdef __cplusplus
}
#endif
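For orientation, a sketch of how the non-backend Vulkan entry points above are driven per graph; it mirrors the call sites added to ggml_graph_compute further down in this diff and is an editor's illustration, not code from the patch:

    // Sketch: prepare a graph for the Vulkan path (see the ggml.c hunks below).
    static void vk_prepare_graph(struct ggml_cgraph * cgraph) {
        for (int i = 0; i < cgraph->n_nodes; i++) {
            ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
        }
        ggml_vk_preallocate_buffers();
        for (int i = 0; i < cgraph->n_nodes; i++) {
            ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
        }
        // ggml_vk_compute_forward() then runs per node during execution, and
        // ggml_vk_graph_cleanup() is called once the whole graph is done.
    }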

ggml.c: 410 changed lines
@ -248,6 +248,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
#include "ggml-cuda.h" #include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h" #include "ggml-opencl.h"
#elif defined(GGML_USE_VULKAN)
#include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
#include "ggml-sycl.h"
#endif #endif
// floating point type used to accumulate sums // floating point type used to accumulate sums
@ -1344,12 +1348,12 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const
// leftovers
for (int i = np; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#else
// scalar
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#endif
}
@ -1478,6 +1482,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
// TODO: optimize performance
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
static const float GELU_COEF_A = 0.044715f; static const float GELU_COEF_A = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f; static const float GELU_QUICK_COEF = -1.702f;
@ -1838,9 +1845,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
"GELU", "GELU",
"GELU_QUICK", "GELU_QUICK",
"SILU", "SILU",
"HARDSWISH",
"HARDSIGMOID",
}; };
static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10"); static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@ -2350,6 +2359,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
ggml_init_cublas();
#elif defined(GGML_USE_CLBLAST)
ggml_cl_init();
#elif defined(GGML_USE_VULKAN)
ggml_vk_init();
#elif defined(GGML_USE_SYCL)
ggml_init_sycl();
#endif
ggml_setup_op_has_task_pass();
@ -4007,6 +4020,20 @@ struct ggml_tensor * ggml_silu_back(
return result;
}
// ggml hardswish
struct ggml_tensor * ggml_hardswish(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
}
// ggml hardsigmoid
struct ggml_tensor * ggml_hardsigmoid(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
}
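A small, illustrative sketch (not from this diff) of using the two new unary ops through the public API; the tensor size, fill value, and thread count are arbitrary:

    static void hardswish_example(void) {
        struct ggml_init_params ip = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * x = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), 1.0f);
        struct ggml_tensor * y = ggml_hardswish  (ctx, x); // x * relu6(x + 3) / 6
        struct ggml_tensor * z = ggml_hardsigmoid(ctx, x); //     relu6(x + 3) / 6

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);
        ggml_build_forward_expand(gf, z);
        ggml_graph_compute_with_ctx(ctx, gf, 1 /* n_threads */);

        ggml_free(ctx);
    }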
// ggml_norm // ggml_norm
static struct ggml_tensor * ggml_norm_impl( static struct ggml_tensor * ggml_norm_impl(
@ -5408,6 +5435,31 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
return result; return result;
} }
// ggml_conv_depthwise
struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1) {
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
struct ggml_tensor * result =
ggml_mul_mat(ctx,
ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC1, KH, KW] => [1, OC, 1, KH * KW]
ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
return result;
}
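A hedged usage fragment for the new depthwise convolution, assuming an existing ggml_context named ctx; the layout follows the reshapes above (kernel treated as [KW, KH, 1, C], input as [W, H, C, N]) and the concrete sizes are made up for illustration:

    // Depthwise 3x3 over a 32x32 input with 16 channels, batch 1,
    // stride 1, padding 1, dilation 1 -> output stays [32, 32, 16, 1].
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F16,  3,  3,  1, 16); // kernel
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 32, 16,  1); // input
    struct ggml_tensor * c = ggml_conv_depthwise_2d(ctx, a, b, 1, 1, 1, 1, 1, 1);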
// ggml_conv_2d // ggml_conv_2d
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
@ -7278,6 +7330,17 @@ static void ggml_compute_forward_add_f32(
const int ith = params->ith;
const int nth = params->nth;
#ifdef GGML_USE_CLBLAST
if (src1->backend == GGML_BACKEND_GPU) {
// TODO: OpenCL kernel support full broadcast
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
if (ith == 0) {
ggml_cl_add(src0, src1, dst);
}
return;
}
#endif
const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
@ -7558,7 +7621,12 @@ static void ggml_compute_forward_add(
switch (src0->type) {
case GGML_TYPE_F32:
{
if (src1->type == GGML_TYPE_F32) {
ggml_compute_forward_add_f32(params, src0, src1, dst);
}
else {
GGML_ASSERT(false);
}
} break;
case GGML_TYPE_F16:
{
@ -7879,6 +7947,9 @@ static void ggml_compute_forward_acc_f32(
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace && (params->type == GGML_TASK_INIT)) {
if (params->ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
@ -8067,7 +8138,7 @@ static void ggml_compute_forward_mul_f32(
const int ith = params->ith;
const int nth = params->nth;
#if defined(GGML_USE_CLBLAST)
if (src1->backend == GGML_BACKEND_GPU) {
// TODO: OpenCL kernel support full broadcast
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
@ -9448,6 +9519,87 @@ static void ggml_compute_forward_silu_back(
}
}
static void ggml_compute_forward_hardswish_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardswish_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_hardswish(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_hardswish_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
static void ggml_compute_forward_hardsigmoid_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardsigmoid_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_hardsigmoid(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_norm // ggml_compute_forward_norm
static void ggml_compute_forward_norm_f32( static void ggml_compute_forward_norm_f32(
@ -9940,11 +10092,30 @@ static void ggml_compute_forward_mul_mat(
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
        const int64_t ne_plane      = ne01*ne00;
        const size_t  desired_wsize = ne13*ne12*ne_plane*sizeof(float);
        UNUSED(desired_wsize);

        if (params->type == GGML_TASK_INIT) {
            if (type != GGML_TYPE_F32) {
                assert(params->wsize >= desired_wsize);
                // parallelize by src0 rows
                for (int64_t i13 = 0; i13 < ne13; i13++) {
                    for (int64_t i12 = 0; i12 < ne12; i12++) {
                        // broadcast src0 into src1 across 2nd,3rd dimension
                        const int64_t i03 = i13/r3;
                        const int64_t i02 = i12/r2;

                        const void            *       x        = (char *)  src0->data + i02*nb02 + i03*nb03;
                              float           * const wdata    = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                              ggml_to_float_t   const to_float = type_traits[type].to_float;

                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
                        }
                    }
                }
            }
            return;
        }
@ -9952,9 +10123,14 @@ static void ggml_compute_forward_mul_mat(
            return;
        }

        // perform sgemm, parallelization controlled by blas lib
        if (ith != 0) {
            return;
        }

        //const int64_t tgemm0 = ggml_perf_time_us();
        for (int64_t i13 = 0; i13 < ne13; i13++) {
            for (int64_t i12 = 0; i12 < ne12; i12++) {
                const int64_t i03 = i13/r3;
                const int64_t i02 = i12/r2;
@ -9963,17 +10139,7 @@ static void ggml_compute_forward_mul_mat(
                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

                if (type != GGML_TYPE_F32) {
                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                }

                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@ -9983,6 +10149,7 @@ static void ggml_compute_forward_mul_mat(
                            0.0f, d, ne01);
            }
        }
        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);

        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@ -9991,6 +10158,9 @@ static void ggml_compute_forward_mul_mat(
#endif
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@ -10155,6 +10325,9 @@ static void ggml_compute_forward_mul_mat_id(
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
char * wdata = params->wdata;
if (src1->type != vec_dot_type) {
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@ -10340,6 +10513,9 @@ static void ggml_compute_forward_out_prod_f32(
return;
}
#endif
if (ith != 0) {
return;
}
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
return;
}
@ -10523,6 +10699,9 @@ static void ggml_compute_forward_out_prod_q_f32(
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
return;
}
@ -10707,6 +10886,9 @@ static void ggml_compute_forward_set_f32(
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace && (params->type == GGML_TASK_INIT)) {
if (params->ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
@ -11031,6 +11213,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
if (params->type == GGML_TASK_INIT) {
if (params->ith != 0) {
return;
}
memset(dst->data, 0, ggml_nbytes(dst));
}
@ -11065,6 +11250,9 @@ static void ggml_compute_forward_get_rows_back_f32(
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
if (params->type == GGML_TASK_INIT) {
if (params->ith != 0) {
return;
}
memset(dst->data, 0, ggml_nbytes(dst));
}
@ -11202,6 +11390,9 @@ static void ggml_compute_forward_diag_mask_f32(
GGML_ASSERT(n_past >= 0);
if (!inplace && (params->type == GGML_TASK_INIT)) {
if (ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@ -12172,6 +12363,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@ -12266,6 +12460,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize);
// prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@ -12464,6 +12661,7 @@ static void ggml_compute_forward_im2col(
}
}
// ggml_compute_forward_conv_transpose_2d
static void ggml_compute_forward_conv_transpose_2d(
@ -12489,6 +12687,9 @@ static void ggml_compute_forward_conv_transpose_2d(
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@ -13353,11 +13554,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
const int64_t D = neq0;
const int64_t N = neq1;
GGML_ASSERT(ne0 == D);
GGML_ASSERT(ne2 == N);
GGML_ASSERT(nbq0 == sizeof(float));
GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
@ -13368,7 +13567,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
GGML_ASSERT(nev0 == D);
GGML_ASSERT(neq1 == N);
GGML_ASSERT(nev0 == D);
// dst cannot be transposed or permuted
@ -13407,8 +13605,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
float scale = 1.0f;
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
// loop over n_batch and n_head
for (int ir = ir0; ir < ir1; ++ir) {
// q indices
@ -14228,6 +14424,14 @@ static void ggml_compute_forward_unary(
{
ggml_compute_forward_silu(params, src0, dst);
} break;
case GGML_UNARY_OP_HARDSWISH:
{
ggml_compute_forward_hardswish(params, src0, dst);
} break;
case GGML_UNARY_OP_HARDSIGMOID:
{
ggml_compute_forward_hardsigmoid(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
@ -14291,6 +14495,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
if (!inplace && params->type == GGML_TASK_INIT) {
if (params->ith != 0) {
return;
}
memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
return;
}
@ -14806,8 +15013,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
}
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
#elif defined(GGML_USE_VULKAN)
const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS
if (skip_cpu) {
ggml_vk_check_results_1(params, tensor);
}
#endif
if (skip_cpu) {
return;
}
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
#endif // GGML_USE_CUBLAS
#ifdef GGML_USE_SYCL
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
if (skip_cpu) {
return;
}
#endif // GGML_USE_SYCL
switch (tensor->op) {
case GGML_OP_DUP:
{
@ -16591,6 +16816,7 @@ struct ggml_compute_state_shared {
// synchronization primitives
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
atomic_int node_task; // active graph node task phase
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
void * abort_callback_data;
@ -16646,6 +16872,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
{
n_tasks = 1;
} break;
@ -16722,7 +16950,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_SOFT_MAX:
{
n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
@ -16837,6 +17065,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
return n_tasks;
}
static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
// wait for other threads to finish
const int last_node_n = * node_n;
while (true) {
if (do_yield) {
sched_yield();
}
* node_n = atomic_load(&state->shared->node_n);
if (* node_n != last_node_n) break;
}
}
static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
// wait for other threads to finish
const int last_task_phase = * task_phase;
while (true) {
if (do_yield) {
sched_yield();
}
* task_phase = atomic_load(&state->shared->node_task);
if (* task_phase != last_task_phase) break;
}
}
static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
@ -16848,6 +17104,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
    set_numa_thread_affinity(state->ith, n_threads);

    int node_n     = -1;
    int task_phase = GGML_TASK_FINALIZE;

    while (true) {
        if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@ -16879,7 +17136,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            // distribute new work or execute it direct if 1T
            while (++node_n < cgraph->n_nodes) {
                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
                struct ggml_tensor * node = cgraph->nodes[node_n];
                const int n_tasks = ggml_get_n_tasks(node, n_threads);
@ -16888,13 +17144,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                params.nth = n_tasks;

                /* INIT */
                if (GGML_OP_HAS_INIT[node->op]) {
                    params.type = GGML_TASK_INIT;
                    ggml_compute_forward(&params, node);
                }

                if (n_tasks == 1) {
                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                    // they do something more efficient than spinning (?)
                    params.type = GGML_TASK_COMPUTE;
@ -16915,38 +17171,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                }
            }

            task_phase = GGML_TASK_INIT;
            atomic_store(&state->shared->n_active,  n_threads);
            atomic_store(&state->shared->node_n,    node_n);
            atomic_store(&state->shared->node_task, task_phase);
        } else {
            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
        }

        // check if we should stop
        if (node_n >= cgraph->n_nodes) break;

        /* INIT & COMPUTE */
        struct ggml_tensor * node = cgraph->nodes[node_n];
        const int n_tasks = ggml_get_n_tasks(node, n_threads);

        struct ggml_compute_params params = {
            /*.type  =*/ GGML_TASK_INIT,
            /*.ith   =*/ state->ith,
            /*.nth   =*/ n_tasks,
            /*.wsize =*/ cplan->work_size,
@ -16954,10 +17196,41 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        };

        if (state->ith < n_tasks) {
            if (GGML_OP_HAS_INIT[node->op]) {
                ggml_compute_forward(&params, node);
            }
        }

        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
            task_phase = GGML_TASK_COMPUTE;
            atomic_store(&state->shared->n_active,  n_threads);
            atomic_store(&state->shared->node_task, task_phase);
        }
        else {
            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
            //       depending on the workload and the operating system.
            //       since it is not clear what is the best approach, it should potentially become user-configurable
            //       ref: https://github.com/ggerganov/ggml/issues/291
            // UPD:  adding the do_yield flag seems to resolve the issue universally
            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
        }

        if (state->ith < n_tasks) {
            params.type = GGML_TASK_COMPUTE;
            ggml_compute_forward(&params, node);
        }

        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
            task_phase = GGML_TASK_FINALIZE;
            atomic_store(&state->shared->n_active,  n_threads);
            atomic_store(&state->shared->node_task, task_phase);
        }
        else {
            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
        }
    }

    return GGML_EXIT_SUCCESS;
}
@ -17012,8 +17285,11 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node)) {
if (node->src[0]->type != GGML_TYPE_F32) {
// here we need memory for fully dequantized matrix from src0
// take into account that src0 can be broadcasted into src1[2,3]
cur = ggml_type_size(GGML_TYPE_F32)
* node->src[0]->ne[0]*node->src[0]->ne[1]
* node->src[1]->ne[2]*node->src[1]->ne[3];
}
} else
#endif
@ -17163,6 +17439,17 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
}
}
#ifdef GGML_USE_VULKAN
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
}
ggml_vk_preallocate_buffers();
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
}
#endif
const int n_threads = cplan->n_threads;
struct ggml_compute_state_shared state_shared = {
@ -17173,6 +17460,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
/*.n_threads =*/ n_threads,
/*.n_active =*/ n_threads,
/*.node_n =*/ -1,
/*.node_task =*/ GGML_TASK_FINALIZE,
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
};
@ -17213,6 +17501,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
}
}
#ifdef GGML_USE_VULKAN
ggml_vk_graph_cleanup();
#endif
// performance stats (graph)
{
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@ -20347,7 +20639,7 @@ int ggml_cpu_has_wasm_simd(void) {
}
int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1;
#else
return 0;
@ -20370,8 +20662,24 @@ int ggml_cpu_has_clblast(void) {
#endif
}
int ggml_cpu_has_vulkan(void) {
#if defined(GGML_USE_VULKAN)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_sycl(void) {
#if defined(GGML_USE_SYCL)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_gpublas(void) {
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_sycl();
}
int ggml_cpu_has_sse3(void) {

ggml.h: 25 changed lines
@ -490,6 +490,8 @@ extern "C" {
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_GELU_QUICK,
GGML_UNARY_OP_SILU,
GGML_UNARY_OP_HARDSWISH,
GGML_UNARY_OP_HARDSIGMOID,
GGML_UNARY_OP_COUNT,
};
@ -1033,6 +1035,16 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// hardswish(x) = x * relu6(x + 3) / 6
GGML_API struct ggml_tensor * ggml_hardswish(
struct ggml_context * ctx,
struct ggml_tensor * a);
// hardsigmoid(x) = relu6(x + 3) / 6
GGML_API struct ggml_tensor * ggml_hardsigmoid(
struct ggml_context * ctx,
struct ggml_tensor * a);
// normalize along rows
GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
@ -1484,6 +1496,17 @@ extern "C" {
int d1,
bool is_2D);
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1);
GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -2258,9 +2281,11 @@ extern "C" {
GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cublas (void);
GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_vulkan (void);
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_vsx (void);
//

ggml_vk_generate_shaders.py: new file, 2362 lines (diff suppressed because it is too large)
@ -101,6 +101,7 @@ class MODEL_ARCH(IntEnum):
PHI2 = auto()
PLAMO = auto()
CODESHELL = auto()
ORION = auto()
class MODEL_TENSOR(IntEnum):
@ -151,6 +152,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PLAMO: "plamo",
MODEL_ARCH.CODESHELL: "codeshell",
MODEL_ARCH.ORION: "orion",
}
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -427,7 +429,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.ORION: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
# TODO
}
@ -452,6 +470,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
MODEL_ARCH.ORION: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
}
#

@ -107,7 +107,7 @@ class GGUFReader:
offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
new_align = self.fields.get('general.alignment')
if new_align is not None:
if new_align.types != [GGUFValueType.UINT32]:
raise ValueError('Bad type for general.alignment field')
self.alignment = new_align.parts[-1][0]
padding = offs % self.alignment

llama.cpp: 943 changed lines (diff suppressed because it is too large)

llama.h: 14 changed lines
@ -6,6 +6,9 @@
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#elif defined(GGML_USE_SYCL)
#include "ggml-sycl.h"
#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
@ -46,7 +49,7 @@
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 4
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
@ -107,6 +110,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
@ -774,6 +778,14 @@ extern "C" {
float p,
size_t min_keep);
/// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
LLAMA_API void llama_sample_entropy(
struct llama_context * ctx,
llama_token_data_array * candidates_p,
float min_temp,
float max_temp,
float exponent_val);
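As a rough sketch of the idea behind llama_sample_entropy (an editor's illustration of the paper's scheme, not the library's exact code): the effective temperature is interpolated between min_temp and max_temp according to the normalized entropy of the candidate distribution, shaped by exponent_val:

    #include <math.h>

    // H(p) / H_max in [0, 1] decides where between min_temp and max_temp we land.
    static float dynatemp_sketch(const float * probs, int n,
                                 float min_temp, float max_temp, float exponent_val) {
        float entropy = 0.0f;
        for (int i = 0; i < n; ++i) {
            if (probs[i] > 0.0f) {
                entropy -= probs[i] * logf(probs[i]);
            }
        }
        const float max_entropy = logf((float) n); // entropy of the uniform distribution
        const float norm = max_entropy > 0.0f ? entropy / max_entropy : 0.0f;
        return min_temp + (max_temp - min_temp) * powf(norm, exponent_val);
    }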
LLAMA_API void llama_sample_temp(
struct llama_context * ctx,
llama_token_data_array * candidates,

@ -4,3 +4,4 @@ allow_untyped_calls = true
allow_untyped_defs = true
allow_incomplete_defs = true
disable_error_code = import-untyped
warn_return_any = false

@ -243,7 +243,6 @@ int main(int argc, char** argv) {
if (useQ4_1) q41.resize(n4);
else q40.resize(n4);
std::vector<block_q8_0> q8(n8);
double sumt = 0, sumt2 = 0, maxt = 0;
double sumqt = 0, sumqt2 = 0, maxqt = 0;
double sum = 0, sumq = 0, exactSum = 0;

scripts/ci-run.sh: new executable file, 50 lines
@ -0,0 +1,50 @@
#!/bin/bash
set -euo pipefail
this=$(realpath "$0"); readonly this
cd "$(dirname "$this")"
shellcheck "$this"
if (( $# != 1 && $# != 2 )); then
cat >&2 <<'EOF'
usage:
ci-run.sh <tmp_dir> [<cache_dir>]
This script wraps ci/run.sh:
* If <tmp_dir> is a ramdisk, you can reduce writes to your SSD. If <tmp_dir> is not a ramdisk, keep in mind that total writes will increase by the size of <cache_dir>.
(openllama_3b_v2: quantized models are about 30GB)
* Persistent model and data files are synced to and from <cache_dir>,
excluding generated .gguf files.
(openllama_3b_v2: persistent files are about 6.6GB)
* <cache_dir> defaults to ~/.cache/llama.cpp
EOF
exit 1
fi
cd .. # => llama.cpp repo root
tmp="$1"
mkdir -p "$tmp"
tmp=$(realpath "$tmp")
echo >&2 "Using tmp=$tmp"
cache="${2-$HOME/.cache/llama.cpp}"
mkdir -p "$cache"
cache=$(realpath "$cache")
echo >&2 "Using cache=$cache"
_sync() {
local from="$1"; shift
local to="$1"; shift
echo >&2 "Syncing from $from to $to"
mkdir -p "$from" "$to"
rsync -a "$from" "$to" --delete-during "$@"
}
_sync "$(realpath .)/" "$tmp/llama.cpp"
_sync "$cache/ci-mnt/models/" "$tmp/llama.cpp/ci-mnt/models/"
cd "$tmp/llama.cpp"
bash ci/run.sh ci-out ci-mnt
_sync 'ci-mnt/models/' "$cache/ci-mnt/models/" --exclude='*.gguf' -P

@ -46,7 +46,7 @@ Formatting considerations:
- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings.
- To define a tensor split, pass a list of floats.
"""
usage = "run-with-preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
epilog = ("  --<ARG_NAME> specify additional CLI ars to be passed to the binary (override all preset files). "
"Unknown args will be ignored.")

@ -1 +1 @@
f2a9472b23cf27e672ed70a2a6eb078f7b060f18

tests/.gitignore: new file (vendored), 3 lines
@ -0,0 +1,3 @@
*
!*.*
test-c.o

@ -1,6 +1,6 @@
function(llama_build_executable source)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source} get-model.cpp)
install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE common)
endfunction()
@ -8,14 +8,20 @@ endfunction()
function(llama_test_executable name source)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
set_property(TEST ${name} PROPERTY LABELS "main")
endfunction()
function(llama_build_and_test_executable source)
llama_build_and_test_executable_with_label(${source} "main")
endfunction()
function(llama_build_and_test_executable_with_label source label)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source} get-model.cpp)
install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE common)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${label})
endfunction()
# llama_build_and_test_executable(test-double-float.cpp) # SLOW
@ -49,12 +55,14 @@ llama_build_and_test_executable(test-llama-grammar.cpp)
llama_build_and_test_executable(test-grad0.cpp)
# llama_build_and_test_executable(test-opt.cpp) # SLOW
llama_build_and_test_executable(test-backend-ops.cpp)
llama_build_and_test_executable(test-rope.cpp)
llama_build_executable(test-flash-attention.cpp)
llama_build_and_test_executable_with_label(test-model-load-cancel.cpp "model")
llama_build_and_test_executable_with_label(test-autorelease.cpp "model")
# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)
add_executable(${TEST_TARGET} test-c.c)

21
tests/get-model.cpp Normal file
View file

@ -0,0 +1,21 @@
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "get-model.h"
char * get_model_or_exit(int argc, char *argv[]) {
char * model_path;
if (argc > 1) {
model_path = argv[1];
} else {
model_path = getenv("LLAMACPP_TEST_MODELFILE");
if (!model_path || strlen(model_path) == 0) {
fprintf(stderr, "\033[33mWARNING: No model file provided. Skipping this test. Set LLAMACPP_TEST_MODELFILE=<gguf_model_path> to silence this warning and run this test.\n\033[0m");
exit(EXIT_SUCCESS);
}
}
return model_path;
}
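
Given the helper above, model-gated tests either take the GGUF path as argv[1] or read LLAMACPP_TEST_MODELFILE, and print a warning and pass when neither is set. A hedged sketch of running them through ctest; the model path is hypothetical, and the "model" label comes from the CMakeLists.txt changes above:

# sketch only: the .gguf path is hypothetical
LLAMACPP_TEST_MODELFILE="$HOME/models/openllama-3b-v2-q8_0.gguf" ctest -L model --output-on-failure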

2
tests/get-model.h Normal file
View file

@ -0,0 +1,2 @@
#pragma once
char * get_model_or_exit(int, char*[]);

View file

@ -5,19 +5,15 @@
#include <thread> #include <thread>
#include "llama.h" #include "llama.h"
#include "get-model.h"
// This creates a new context inside a pthread and then tries to exit cleanly. // This creates a new context inside a pthread and then tries to exit cleanly.
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
if (argc < 2) { auto * model_path = get_model_or_exit(argc, argv);
printf("Usage: %s model.gguf\n", argv[0]);
return 0; // intentionally return success
}
const std::string fname = argv[1]; std::thread([&model_path]() {
std::thread([&fname]() {
llama_backend_init(false); llama_backend_init(false);
auto * model = llama_load_model_from_file(fname.c_str(), llama_model_default_params()); auto * model = llama_load_model_from_file(model_path, llama_model_default_params());
auto * ctx = llama_new_context_with_model(model, llama_context_default_params()); auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);

View file

@ -102,7 +102,6 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
} else if (t->type == GGML_TYPE_I8) { } else if (t->type == GGML_TYPE_I8) {
tv.push_back((float)*(int8_t *) &buf[i]); tv.push_back((float)*(int8_t *) &buf[i]);
} else if (quantized) { } else if (quantized) {
std::vector<float> vq(ggml_blck_size(t->type));
tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type)); tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type));
tv.insert(tv.end(), vq.begin(), vq.end()); tv.insert(tv.end(), vq.begin(), vq.end());
} else { } else {
@ -240,10 +239,17 @@ static std::string var_to_str(ggml_type type) {
#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j) #define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k) #define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
#ifdef GGML_USE_SYCL
static bool inline _isinf(float f) {
return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
}
#else
static bool inline _isinf(float f) { return std::isinf(f); }
#endif
// accept FLT_MAX as infinity // accept FLT_MAX as infinity
static bool isinf_or_max(float f) { static bool isinf_or_max(float f) {
return std::isinf(f) || f == FLT_MAX || f == -FLT_MAX; return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
} }
static bool ggml_is_view_op(enum ggml_op op) { static bool ggml_is_view_op(enum ggml_op op) {
@ -1396,7 +1402,7 @@ struct test_flash_attn_ext : public test_case {
} }
double max_nmse_err() override { double max_nmse_err() override {
return 5e-5; return 5e-4;
} }
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8) test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
@ -1412,6 +1418,48 @@ struct test_flash_attn_ext : public test_case {
} }
}; };
// Attention
struct test_attn : public test_case {
const int64_t hs; // head size
const int64_t nh; // num heads
const int64_t kv; // kv size
const int64_t nb; // batch size
std::string op_desc(ggml_tensor * t) override {
return "ATTN";
GGML_UNUSED(t);
}
std::string vars() override {
return VARS_TO_STR4(hs, nh, kv, nb);
}
double max_nmse_err() override {
return 5e-4;
}
test_attn(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
: hs(hs), nh(nh), kv(kv), nb(nb) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, hs, nh, 1); // transposed
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, kv, nb, 1, 1);
struct ggml_tensor * cur;
cur = ggml_mul_mat (ctx, k, q);
cur = ggml_soft_max_ext(ctx, cur, mask, 1.0f/sqrtf(hs));
cur = ggml_mul_mat (ctx, v, cur);
cur = ggml_permute (ctx, cur, 0, 2, 1, 3);
cur = ggml_cont_2d (ctx, cur, hs*nh, nb);
return cur;
}
};
// Mixtral MOE // Mixtral MOE
struct test_moe : public test_case { struct test_moe : public test_case {
const int n_experts; const int n_experts;
@ -1678,9 +1726,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_pad()); test_cases.emplace_back(new test_pad());
test_cases.emplace_back(new test_leaky_relu()); test_cases.emplace_back(new test_leaky_relu());
test_cases.emplace_back(new test_flash_attn_ext(128, 32, 256, 8)); for (int hs : { 64, 80, 96, 112, 128, 256, }) {
test_cases.emplace_back(new test_flash_attn_ext(128, 32, 256, 7)); for (int nh : { 32, }) {
test_cases.emplace_back(new test_flash_attn_ext(128, 32, 256, 1)); for (int kv : { 512, 1024, 2048, 4096, }) {
for (int nb : { 1, 2, 4, 8, 512, 1024, 2048, }) {
test_cases.emplace_back(new test_attn (hs, nh, kv, nb));
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb));
}
}
}
}
#if !defined(__SANITIZE_THREAD__) #if !defined(__SANITIZE_THREAD__)
// FIXME: these tests use too much memory with thread sanitizer // FIXME: these tests use too much memory with thread sanitizer

View file

@ -190,7 +190,6 @@ int main()
index++; index++;
} }
std::vector<std::vector<const llama_grammar_element *>> next_stacks;
std::vector<llama_grammar_candidate> next_candidates; std::vector<llama_grammar_candidate> next_candidates;
next_candidates.resize(24); next_candidates.resize(24);

View file

@ -0,0 +1,27 @@
#include "llama.h"
#include "get-model.h"
#include <cstdlib>
int main(int argc, char *argv[] ) {
auto * model_path = get_model_or_exit(argc, argv);
auto * file = fopen(model_path, "r");
if (file == nullptr) {
fprintf(stderr, "no model at '%s' found\n", model_path);
return EXIT_FAILURE;
}
fprintf(stderr, "using '%s'\n", model_path);
fclose(file);
llama_backend_init(false);
auto params = llama_model_params{};
params.use_mmap = false;
params.progress_callback = [](float progress, void * ctx){
(void) ctx;
return progress > 0.50;
};
auto * model = llama_load_model_from_file(model_path, params);
llama_backend_free();
return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE;
}

View file

@ -5,11 +5,10 @@
#undef NDEBUG #undef NDEBUG
#endif #endif
#include <cmath>
#include <numeric>
#include <cassert>
#include <vector>
#include <algorithm> #include <algorithm>
#include <cmath>
#include <string>
#include <vector>
static void dump(const llama_token_data_array * candidates) { static void dump(const llama_token_data_array * candidates) {
for (size_t i = 0; i < candidates->size; i++) { for (size_t i = 0; i < candidates->size; i++) {
@ -20,11 +19,11 @@ static void dump(const llama_token_data_array * candidates) {
#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0) #define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) { static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
size_t n_vocab = probs.size(); const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
float logit = log(probs[token_id]); const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
} }
@ -41,11 +40,11 @@ static void test_top_k(const std::vector<float> & probs, const std::vector<float
} }
static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) { static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
size_t n_vocab = probs.size(); const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
float logit = log(probs[token_id]); const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
} }
@ -62,11 +61,11 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
} }
static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) { static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
size_t n_vocab = probs.size(); const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
float logit = log(probs[token_id]); const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
} }
@ -81,12 +80,33 @@ static void test_tfs(const std::vector<float> & probs, const std::vector<float>
} }
} }
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) { static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
size_t n_vocab = probs.size(); const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
float logit = log(probs[token_id]); const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
DUMP(&candidates_p);
llama_sample_min_p(nullptr, &candidates_p, p, 1);
DUMP(&candidates_p);
llama_sample_softmax(nullptr, &candidates_p);
GGML_ASSERT(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
}
}
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
} }
@ -107,11 +127,11 @@ static void test_repetition_penalties(
) { ) {
GGML_ASSERT(probs.size() == expected_probs.size()); GGML_ASSERT(probs.size() == expected_probs.size());
size_t n_vocab = probs.size(); const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
float logit = log(probs[token_id]); const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
} }
@ -128,6 +148,88 @@ static void test_repetition_penalties(
} }
} }
static void test_sampler_queue(
const size_t n_vocab, const std::string samplers_sequence, const int top_k, const float top_p, const float min_p
) {
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(token_id);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
llama_token min_token_id = 0;
const llama_token max_token_id = n_vocab-1;
for (auto s : samplers_sequence) {
switch (s){
case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break;
case 'f': GGML_ASSERT(false && "tail_free test not implemented"); break;
case 'y': GGML_ASSERT(false && "typical test not implemented"); break;
case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break;
case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break;
case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
default : GGML_ASSERT(false && "Unknown sampler"); break;
}
llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
const int size = candidates_p.size;
if (s == 'k') {
const int expected_size = std::min(size, top_k);
min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k));
GGML_ASSERT(size == expected_size);
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
} else if (s == 'p') {
const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2;
const int softmax_numerator_target = ceilf(top_p * softmax_divisor);
min_token_id = n_vocab;
int expected_size = 0;
int cumsum = 0;
do { // do-while because always at least one token is sampled
min_token_id--;
expected_size++;
cumsum += min_token_id;
} while (cumsum < softmax_numerator_target);
// token 0 has p == 0, need special consideration for cumsum because top_p immediately returns
if (min_token_id == 1) {
min_token_id--;
expected_size += 1;
}
GGML_ASSERT(size == expected_size);
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
} else if (s == 'm') {
int expected_size = ceilf((1.0f-min_p) * n_vocab);
expected_size = std::max(expected_size, 1);
expected_size = std::min(expected_size, size);
min_token_id = floorf(min_p * n_vocab);
min_token_id = std::max(min_token_id, 1);
min_token_id = std::max(min_token_id, (llama_token)(n_vocab - size));
min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));
GGML_ASSERT(size == expected_size);
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
} else {
GGML_ASSERT(false);
}
}
printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n",
samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
}
int main(void) { int main(void) {
ggml_time_init(); ggml_time_init();
@ -139,6 +241,15 @@ int main(void) {
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.26f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.49f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.51f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.74f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
@ -154,6 +265,34 @@ int main(void) {
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f); test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f); test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
test_sampler_queue(10000, "k", 1, 1.0f, 1.0f);
test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f);
test_sampler_queue(10000, "p", 10000, 0.0f, 1.0f);
test_sampler_queue(10000, "m", 10000, 1.0f, 1.0f);
test_sampler_queue(10000, "m", 10000, 1.0f, 1e-12);
test_sampler_queue(10000, "k", 100, 1.0000f, 1.0f);
test_sampler_queue(10000, "p", 10000, 0.0002f, 1.0f);
test_sampler_queue(10000, "p", 10000, 0.8000f, 1.0f);
test_sampler_queue(10000, "m", 10000, 1.0000f, 9997.9f/9999.0f);
test_sampler_queue(10000, "m", 10000, 1.0000f, 0.1f);
test_sampler_queue(10000, "kp", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "km", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "pk", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "pm", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "mk", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "mp", 100, 0.8f, 9997.9f/9999.0f);
test_sampler_queue(10000, "mp", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "kpm", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "kmp", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "pkm", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "pmk", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "mkp", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "mpk", 100, 0.8f, 0.1f);
printf("OK\n"); printf("OK\n");
return 0; return 0;

View file

@ -2,8 +2,9 @@
#include <cassert> #include <cassert>
#include <stdexcept> #include <stdexcept>
#include <vector> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector>
static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = { static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F}, {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},