Merge branch 'master' into compilade/bitnet-ternary

commit 7f3a619c98
Author: Francis Couture-Harpin
Date: 2024-09-04 13:26:50 -04:00
94 changed files with 12171 additions and 7726 deletions


@@ -1,18 +1,16 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
 
 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
 
 COPY requirements.txt requirements.txt
 COPY requirements requirements
@@ -24,13 +22,12 @@ WORKDIR /app
 
 COPY . .
 
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-
-RUN make -j$(nproc)
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .
 
 ENTRYPOINT ["/app/.devops/tools.sh"]


@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image
@@ -8,28 +8,30 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
 
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
 
 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git cmake
 
 WORKDIR /app
 
 COPY . .
 
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-
-RUN make -j$(nproc) llama-cli
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc)
 
 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 
 RUN apt-get update && \
     apt-get install -y libgomp1
 
-COPY --from=build /app/llama-cli /llama-cli
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-cli /llama-cli
 
 ENTRYPOINT [ "/llama-cli" ]


@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image
@@ -8,31 +8,34 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
 
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
 
 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
 
 WORKDIR /app
 
 COPY . .
 
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-
-RUN make -j$(nproc) llama-server
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc)
 
 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
-COPY --from=build /app/llama-server /llama-server
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]


@@ -26,6 +26,8 @@ RUN apt-get update && \
 COPY --from=build /app/build/bin/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]


@@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 
 # Enable cURL
 ENV LLAMA_CURL=1


@@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \
     rm -rf /app
 
 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]


@@ -21,6 +21,8 @@ RUN apt-get update && \
 COPY --from=build /app/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]


@@ -1,13 +1,52 @@
+{ inputs, ... }:
 {
   perSystem =
-    { config, lib, ... }:
+    {
+      config,
+      lib,
+      system,
+      ...
+    }:
     {
       devShells =
-        lib.concatMapAttrs
-          (name: package: {
-            ${name} = package.passthru.shell;
-            ${name + "-extra"} = package.passthru.shell-extra;
-          })
-          config.packages;
+        let
+          pkgs = import inputs.nixpkgs { inherit system; };
+          stdenv = pkgs.stdenv;
+          scripts = config.packages.python-scripts;
+        in
+        lib.pipe (config.packages) [
+          (lib.concatMapAttrs (
+            name: package: {
+              ${name} = pkgs.mkShell {
+                name = "${name}";
+                inputsFrom = [ package ];
+                shellHook = ''
+                  echo "Entering ${name} devShell"
+                '';
+              };
+              "${name}-extra" =
+                if (name == "python-scripts") then
+                  null
+                else
+                  pkgs.mkShell {
+                    name = "${name}-extra";
+                    inputsFrom = [
+                      package
+                      scripts
+                    ];
+                    # Extra packages that *may* be used by some scripts
+                    packages = [
+                      pkgs.python3Packages.tiktoken
+                    ];
+                    shellHook = ''
+                      echo "Entering ${name} devShell"
+                      addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
+                    '';
+                  };
+            }
+          ))
+          (lib.filterAttrs (name: value: value != null))
+        ];
     };
 }


@@ -26,16 +26,14 @@
       config.cudaSupport = true;
       config.allowUnfreePredicate =
         p:
-        builtins.all
-          (
-            license:
-            license.free
-            || builtins.elem license.shortName [
-              "CUDA EULA"
-              "cuDNN EULA"
-            ]
-          )
-          (p.meta.licenses or [ p.meta.license ]);
+        builtins.all (
+          license:
+          license.free
+          || builtins.elem license.shortName [
+            "CUDA EULA"
+            "cuDNN EULA"
+          ]
+        ) (p.meta.licenses or [ p.meta.license ]);
     };
     # Ensure dependencies use ROCm consistently
     pkgsRocm = import inputs.nixpkgs {


@@ -0,0 +1,36 @@
{
lib,
llamaVersion,
numpy,
tqdm,
sentencepiece,
pyyaml,
poetry-core,
buildPythonPackage,
pytestCheckHook,
}:
buildPythonPackage {
pname = "gguf";
version = llamaVersion;
pyproject = true;
nativeBuildInputs = [ poetry-core ];
propagatedBuildInputs = [
numpy
tqdm
sentencepiece
pyyaml
];
src = lib.cleanSource ../../gguf-py;
pythonImportsCheck = [
"numpy"
"gguf"
];
nativeCheckInputs = [ pytestCheckHook ];
doCheck = true;
meta = with lib; {
description = "Python package for writing binary files in the GGUF format";
license = licenses.mit;
maintainers = [ maintainers.ditsuke ];
};
}


@@ -3,31 +3,33 @@
   glibc,
   config,
   stdenv,
-  mkShell,
   runCommand,
   cmake,
   ninja,
   pkg-config,
   git,
-  python3,
   mpi,
   blas,
   cudaPackages,
+  autoAddDriverRunpath,
   darwin,
   rocmPackages,
   vulkan-headers,
   vulkan-loader,
   curl,
   shaderc,
-  useBlas ? builtins.all (x: !x) [
-    useCuda
-    useMetalKit
-    useRocm
-    useVulkan
-  ] && blas.meta.available,
+  useBlas ?
+    builtins.all (x: !x) [
+      useCuda
+      useMetalKit
+      useRocm
+      useVulkan
+    ]
+    && blas.meta.available,
   useCuda ? config.cudaSupport,
   useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
-  useMpi ? false, # Increases the runtime closure size by ~700M
+  # Increases the runtime closure size by ~700M
+  useMpi ? false,
   useRocm ? config.rocmSupport,
   enableCurl ? true,
   useVulkan ? false,
@@ -37,8 +39,8 @@
   # otherwise we get libstdc++ errors downstream.
   effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
   enableStatic ? effectiveStdenv.hostPlatform.isStatic,
-  precompileMetalShaders ? false
-}@inputs:
+  precompileMetalShaders ? false,
+}:
 
 let
   inherit (lib)
@@ -46,7 +48,6 @@ let
     cmakeFeature
     optionals
     strings
-    versionOlder
     ;
 
   stdenv = throw "Use effectiveStdenv instead";
@@ -62,52 +63,9 @@ let
   pnameSuffix =
     strings.optionalString (suffices != [ ])
      "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix =
-    strings.optionalString (suffices != [ ])
-      ", accelerated with ${strings.concatStringsSep ", " suffices}";
-
-  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
-
-  # TODO: package the Python in this repository in a Nix-like way.
-  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
-  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
-  # https://peps.python.org/pep-0517/
-  #
-  # TODO: Package up each Python script or service appropriately, by making
-  # them into "entrypoints"
-  llama-python = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-    ]
-  );
-
-  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
-  llama-python-extra = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-      ps.tiktoken
-      ps.torchWithoutCuda
-      ps.transformers
-      # server bench
-      ps.matplotlib
-      # server tests
-      ps.openai
-      ps.behave
-      ps.prometheus-client
-      # for examples/pydantic-models-to-grammar-examples.py
-      ps.docstring-parser
-      ps.pydantic
-      # for scripts/compare-llama-bench.py
-      ps.gitpython
-      ps.tabulate
-    ]
-  );
+  descriptionSuffix = strings.optionalString (
+    suffices != [ ]
+  ) ", accelerated with ${strings.concatStringsSep ", " suffices}";
 
   xcrunHost = runCommand "xcrunHost" { } ''
     mkdir -p $out/bin
@@ -144,8 +102,7 @@ let
   ];
 in
 
-effectiveStdenv.mkDerivation (
-  finalAttrs: {
+effectiveStdenv.mkDerivation (finalAttrs: {
   pname = "llama-cpp${pnameSuffix}";
   version = llamaVersion;
@@ -193,15 +150,10 @@ effectiveStdenv.mkDerivation (
     ++ optionals useCuda [
       cudaPackages.cuda_nvcc
-      # TODO: Replace with autoAddDriverRunpath
-      # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
-      cudaPackages.autoAddOpenGLRunpathHook
+      autoAddDriverRunpath
     ]
-    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
-      glibc.static
-    ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
-      xcrunHost
-    ];
+    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
+    ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
 
   buildInputs =
     optionals effectiveStdenv.isDarwin darwinBuildInputs
@@ -256,35 +208,6 @@ effectiveStdenv.mkDerivation (
       cp $src/include/llama.h $out/include/
     '';
 
-    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
-    passthru = {
-      inherit
-        useBlas
-        useCuda
-        useMetalKit
-        useMpi
-        useRocm
-        useVulkan
-        ;
-
-      shell = mkShell {
-        name = "shell-${finalAttrs.finalPackage.name}";
-        description = "contains numpy and sentencepiece";
-        buildInputs = [ llama-python ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-        shellHook = ''
-          addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
-        '';
-      };
-
-      shell-extra = mkShell {
-        name = "shell-extra-${finalAttrs.finalPackage.name}";
-        description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
-        buildInputs = [ llama-python-extra ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-      };
-    };
-
   meta = {
     # Configurations we don't want even the CI to evaluate. Results in the
     # "unsupported platform" messages. This is mostly a no-op, because
@@ -320,5 +243,4 @@ effectiveStdenv.mkDerivation (
     # Extend `badPlatforms` instead
     platforms = lib.platforms.all;
   };
-  }
-)
+})


@@ -0,0 +1,66 @@
{
lib,
stdenv,
buildPythonPackage,
poetry-core,
mkShell,
python3Packages,
gguf-py,
}@inputs:
let
llama-python-deps = with python3Packages; [
numpy
sentencepiece
transformers
protobuf
torchWithoutCuda
gguf-py
tqdm
# for scripts/compare-llama-bench.py
gitpython
tabulate
# for examples/pydantic-models-to-grammar-examples.py
docstring-parser
pydantic
];
llama-python-test-deps = with python3Packages; [
# Server bench
matplotlib
# server tests
openai
behave
prometheus-client
];
in
buildPythonPackage ({
pname = "llama-scripts";
version = "0.0.0";
pyproject = true;
# NOTE: The files filtered out here are not visible in the build sandbox, neither
# do they affect the output hash. They can be modified without triggering a rebuild.
src = lib.cleanSourceWith {
filter =
name: type:
let
any = builtins.any (x: x);
baseName = builtins.baseNameOf name;
in
any [
(lib.hasSuffix ".py" name)
(baseName == "README.md")
(baseName == "pyproject.toml")
];
src = lib.cleanSource ../../.;
};
nativeBuildInputs = [ poetry-core ];
nativeCheckInputs = llama-python-test-deps;
dependencies = llama-python-deps;
})


@@ -1,19 +1,41 @@
 {
   lib,
   newScope,
+  python3,
   llamaVersion ? "0.0.0",
 }:
 
+let
+  pythonPackages = python3.pkgs;
+  buildPythonPackage = pythonPackages.buildPythonPackage;
+  numpy = pythonPackages.numpy;
+  tqdm = pythonPackages.tqdm;
+  sentencepiece = pythonPackages.sentencepiece;
+  pyyaml = pythonPackages.pyyaml;
+  poetry-core = pythonPackages.poetry-core;
+  pytestCheckHook = pythonPackages.pytestCheckHook;
+in
+
 # We're using `makeScope` instead of just writing out an attrset
 # because it allows users to apply overlays later using `overrideScope'`.
 # Cf. https://noogle.dev/f/lib/makeScope
-lib.makeScope newScope (
-  self: {
+lib.makeScope newScope (self: {
   inherit llamaVersion;
+  gguf-py = self.callPackage ./package-gguf-py.nix {
+    inherit
+      buildPythonPackage
+      numpy
+      tqdm
+      sentencepiece
+      poetry-core
+      pyyaml
+      pytestCheckHook
+      ;
+  };
+  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
   llama-cpp = self.callPackage ./package.nix { };
   docker = self.callPackage ./docker.nix { };
   docker-min = self.callPackage ./docker.nix { interactive = false; };
   sif = self.callPackage ./sif.nix { };
-  }
-)
+})

.ecrc

@@ -1,5 +1,5 @@
 {
-  "Exclude": ["^\\.gitmodules$"],
+  "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
   "Disable": {
     "IndentSize": true
   }


@@ -96,21 +96,12 @@ jobs:
         env:
           GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
 
-      - name: Build and push Docker image (versioned)
+      - name: Build and push Docker image (tagged + versioned)
         if: github.event_name == 'push'
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v6
         with:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
-          file: ${{ matrix.config.dockerfile }}
-
-      - name: Build and push Docker image (tagged)
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          push: ${{ github.event_name == 'push' }}
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
           file: ${{ matrix.config.dockerfile }}


@@ -10,32 +10,14 @@
 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
 
+> [!IMPORTANT]
+[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
+
 ## Recent API changes
 
-- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
-- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
-- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
-- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
-- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
-- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
-- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
-- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
+- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
+- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
 
 ## Hot topics
 
-- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
-- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
-- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
-- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
-- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
-- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
-- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
-- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
-- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
-- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
+- *add hot topics here*
 
 ----


@@ -13,6 +13,9 @@
 # # with SYCL support
 # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with VULKAN support
+# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
 
 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"
@@ -40,7 +43,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi
 
 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
 fi
 
 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -52,6 +55,10 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi
 
+if [ ! -z ${GG_BUILD_VULKAN} ]; then
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
+fi
+
 ## helpers
 
 # download a file if it does not exist or if it is outdated
@@ -107,7 +114,7 @@ function gg_run_ctest_debug {
     gg_check_build_requirements
 
     (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@@ -138,7 +145,7 @@ function gg_run_ctest_release {
    gg_check_build_requirements
 
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-   (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+   (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
 
    if [ -z ${GG_BUILD_LOW_PERF} ]; then
        (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@@ -266,7 +273,6 @@ function gg_sum_ctest_with_model_release {
 }
 
 # open_llama_7b_v2
-# requires: GG_BUILD_CUDA
 
 function gg_run_open_llama_7b_v2 {
     cd ${SRC}
@@ -290,8 +296,8 @@ function gg_run_open_llama_7b_v2 {
     set -e
 
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@@ -425,7 +431,7 @@ function gg_run_pythia_1_4b {
     set -e
 
     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@@ -535,7 +541,6 @@ function gg_sum_pythia_1_4b {
 }
 
 # pythia_2_8b
-# requires: GG_BUILD_CUDA
 
 function gg_run_pythia_2_8b {
     cd ${SRC}
@@ -556,8 +561,8 @@ function gg_run_pythia_2_8b {
     set -e
 
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@@ -692,7 +697,7 @@ function gg_run_embd_bge_small {
     set -e
 
     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@@ -761,7 +766,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 fi
 
 if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-    if [ -z ${GG_BUILD_CUDA} ]; then
+    if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
        test $ret -eq 0 && gg_run pythia_1_4b
    else
        test $ret -eq 0 && gg_run pythia_2_8b


@@ -251,6 +251,57 @@ int32_t cpu_get_num_math() {
     return cpu_get_num_physical_cores();
 }
 
+// Helper for setting process priority
+
+#if defined(_WIN32)
+
+bool set_process_priority(enum ggml_sched_priority prio) {
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        return true;
+    }
+
+    DWORD p = NORMAL_PRIORITY_CLASS;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
+        case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
+        case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
+        case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS;     break;
+    }
+
+    if (!SetPriorityClass(GetCurrentProcess(), p)) {
+        fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
+        return false;
+    }
+
+    return true;
+}
+
+#else // MacOS and POSIX
+#include <sys/types.h>
+#include <sys/resource.h>
+
+bool set_process_priority(enum ggml_sched_priority prio) {
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        return true;
+    }
+
+    int p = 0;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   p =   0; break;
+        case GGML_SCHED_PRIO_MEDIUM:   p =  -5; break;
+        case GGML_SCHED_PRIO_HIGH:     p = -10; break;
+        case GGML_SCHED_PRIO_REALTIME: p = -20; break;
+    }
+
+    if (!setpriority(PRIO_PROCESS, 0, p)) {
+        fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
+        return false;
+    }
+    return true;
+}
+
+#endif
+
 //
 // CLI argument parsing
 //
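A minimal usage sketch (not part of the commit, assuming only the entry points shown elsewhere in this diff): the helper is meant to be called once, early in a tool's main(), after argument parsing.

    // sketch: apply the user-requested scheduling priority up front;
    // on POSIX this wraps setpriority(), and raising priority usually
    // needs privileges, so a false return is a warning, not a fatal error
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }
    set_process_priority(params.cpuparams.priority);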
@@ -277,6 +328,30 @@ void gpt_params_handle_model_default(gpt_params & params) {
     }
 }
 
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
+    int32_t n_set = 0;
+
+    if (cpuparams.n_threads < 0) {
+        // Assuming everything about cpuparams is invalid
+        if (role_model != nullptr) {
+            cpuparams = *role_model;
+        } else {
+            cpuparams.n_threads = cpu_get_num_math();
+        }
+    }
+
+    for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (cpuparams.cpumask[i]) {
+            n_set++;
+        }
+    }
+
+    if (n_set && n_set < cpuparams.n_threads) {
+        // Not enough set bits, may experience performance issues.
+        fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
+    }
+}
+
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
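To make the inheritance rule above concrete, a small sketch with hypothetical values (not from the commit): n_threads == -1 marks a cpu_params as wholly unset, so it either copies its role model or falls back to cpu_get_num_math().

    cpu_params gen;
    gen.n_threads = 8;                      // user passed -t 8

    cpu_params batch;                       // untouched, n_threads == -1
    postprocess_cpu_params(gen, nullptr);   // no role model; -1 would become cpu_get_num_math()
    postprocess_cpu_params(batch, &gen);    // copies gen wholesale -> batch.n_threads == 8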
@@ -296,6 +371,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         }
     }
 
+    postprocess_cpu_params(params.cpuparams, nullptr);
+    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
     if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
@@ -327,7 +407,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 void gpt_params_parse_from_env(gpt_params & params) {
     // we only care about server-related params for now
     get_env("LLAMA_ARG_MODEL", params.model);
-    get_env("LLAMA_ARG_THREADS", params.n_threads);
+    get_env("LLAMA_ARG_MODEL_URL", params.model_url);
+    get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
+    get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
+    get_env("LLAMA_ARG_HF_FILE", params.hf_file);
+    get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads);
     get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
     get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
     get_env("LLAMA_ARG_BATCH", params.n_batch);
@@ -341,6 +425,9 @@ void gpt_params_parse_from_env(gpt_params & params) {
     get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
     get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
     get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
+    get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching);
+    get_env("LLAMA_ARG_HOST", params.hostname);
+    get_env("LLAMA_ARG_PORT", params.port);
 }
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
@@ -361,6 +448,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
+bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+    size_t dash_loc = range.find('-');
+    if (dash_loc == std::string::npos) {
+        fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+        return false;
+    }
+
+    size_t start_i;
+    size_t end_i;
+
+    if (dash_loc == 0) {
+        start_i = 0;
+    } else {
+        start_i = std::stoull(range.substr(0, dash_loc));
+        if (start_i >= GGML_MAX_N_THREADS) {
+            fprintf(stderr, "Start index out of bounds!\n");
+            return false;
+        }
+    }
+
+    if (dash_loc == range.length() - 1) {
+        end_i = GGML_MAX_N_THREADS - 1;
+    } else {
+        end_i = std::stoull(range.substr(dash_loc + 1));
+        if (end_i >= GGML_MAX_N_THREADS) {
+            fprintf(stderr, "End index out of bounds!\n");
+            return false;
+        }
+    }
+
+    for (size_t i = start_i; i <= end_i; i++) {
+        boolmask[i] = true;
+    }
+
+    return true;
+}
+
+bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+    // Discard potential 0x prefix
+    size_t start_i = 0;
+    if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
+        start_i = 2;
+    }
+
+    size_t num_digits = mask.length() - start_i;
+    if (num_digits > 128) num_digits = 128;
+
+    size_t end_i = num_digits + start_i;
+
+    for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
+        char c = mask.at(i);
+        int8_t id = c;
+
+        if ((c >= '0' && c <= '9')) {
+            id -= '0';
+        } else if (c >= 'a' && c <= 'f') {
+            id -= 'a' - 10;
+        } else if (c >= 'A' && c <= 'F') {
+            id -= 'A' - 10;
+        } else {
+            fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
+            return false;
+        }
+
+        boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
+        boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
+        boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
+        boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
+    }
+
+    return true;
+}
+
 #define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
 
 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
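Worked examples for the two parsers above (a sketch, not from the commit). The hex mask is consumed from its most significant nibble, and bit n of the mask value maps to CPU n; ranges are inclusive and either end may be omitted.

    bool cpus[GGML_MAX_N_THREADS] = { false };
    parse_cpu_mask("0xA", cpus);    // 0xA = 0b1010 -> selects CPUs 1 and 3

    bool cpus2[GGML_MAX_N_THREADS] = { false };
    parse_cpu_range("-3", cpus2);   // open start -> CPUs 0..3
    parse_cpu_range("4-7", cpus2);  // CPUs 4..7, inclusive on both ends
    parse_cpu_range("8-", cpus2);   // open end -> CPUs 8..GGML_MAX_N_THREADS-1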
@@ -377,36 +537,142 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "-t" || arg == "--threads") {
         CHECK_ARG
-        params.n_threads = std::stoi(argv[i]);
-        if (params.n_threads <= 0) {
-            params.n_threads = std::thread::hardware_concurrency();
+        params.cpuparams.n_threads = std::stoi(argv[i]);
+        if (params.cpuparams.n_threads <= 0) {
+            params.cpuparams.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-C" || arg == "--cpu-mask") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "-Cr" || arg == "--cpu-range") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "--prio") {
+        CHECK_ARG
+        params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict") {
+        CHECK_ARG
+        params.cpuparams.strict_cpu = std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--poll") {
+        CHECK_ARG
+        params.cpuparams.poll = std::stoul(argv[i]);
+        return true;
+    }
     if (arg == "-tb" || arg == "--threads-batch") {
         CHECK_ARG
-        params.n_threads_batch = std::stoi(argv[i]);
-        if (params.n_threads_batch <= 0) {
-            params.n_threads_batch = std::thread::hardware_concurrency();
+        params.cpuparams_batch.n_threads = std::stoi(argv[i]);
+        if (params.cpuparams_batch.n_threads <= 0) {
+            params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-Cb" || arg == "--cpu-mask-batch") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "-Crb" || arg == "--cpu-range_batch") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "--prio-batch") {
+        CHECK_ARG
+        params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-batch") {
+        params.cpuparams_batch.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-batch") {
+        CHECK_ARG
+        params.cpuparams_batch.poll = std::stoul(argv[i]);
+        return true;
+    }
     if (arg == "-td" || arg == "--threads-draft") {
         CHECK_ARG
-        params.n_threads_draft = std::stoi(argv[i]);
-        if (params.n_threads_draft <= 0) {
-            params.n_threads_draft = std::thread::hardware_concurrency();
+        params.draft_cpuparams.n_threads = std::stoi(argv[i]);
+        if (params.draft_cpuparams.n_threads <= 0) {
+            params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-Cd" || arg == "--cpu-mask-draft") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.draft_cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "-Crd" || arg == "--cpu-range-draft") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.draft_cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "--prio-draft") {
+        CHECK_ARG
+        params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-draft") {
+        params.draft_cpuparams.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-draft") {
+        CHECK_ARG
+        params.draft_cpuparams.poll = std::stoul(argv[i]);
+        return true;
+    }
     if (arg == "-tbd" || arg == "--threads-batch-draft") {
         CHECK_ARG
-        params.n_threads_batch_draft = std::stoi(argv[i]);
-        if (params.n_threads_batch_draft <= 0) {
-            params.n_threads_batch_draft = std::thread::hardware_concurrency();
+        params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
+        if (params.draft_cpuparams_batch.n_threads <= 0) {
+            params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.draft_cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "--prio-batch-draft") {
+        CHECK_ARG
+        params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-batch-draft") {
+        params.draft_cpuparams_batch.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-batch-draft") {
+        CHECK_ARG
+        params.draft_cpuparams_batch.poll = std::stoul(argv[i]);
+        return true;
+    }
     if (arg == "-p" || arg == "--prompt") {
         CHECK_ARG
         params.prompt = argv[i];
@@ -901,7 +1167,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
         CHECK_ARG
         params.n_gpu_layers_draft = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
@@ -968,11 +1234,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
         return true;
     }
+#ifdef GGML_USE_RPC
     if (arg == "--rpc") {
         CHECK_ARG
         params.rpc_servers = argv[i];
         return true;
     }
+#endif
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;
@@ -1491,11 +1759,40 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
     options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
     options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
-    options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
+    options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
     options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
     options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
-    options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
-                        "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+    options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+
+#ifndef GGML_USE_OPENMP
+    // these options are available only with the internal threadpool
+    options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
+    options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
+    options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
+    options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
+    options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
+    options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
+    options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
+    options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"});
+    options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"});
+    options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"});
+    options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
+    options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
+    options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"});
+    options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"});
+    options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"});
+    options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"});
+    options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
+                        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"});
+    options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>",
+                        "Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
+    options.push_back({ "speculative", " --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"});
+    options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"});
+#endif // GGML_USE_OPENMP
+
     options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
     options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
     options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
@@ -1634,7 +1931,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
 
     options.push_back({ "backend" });
+#ifdef GGML_USE_RPC
     options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
+#endif
 
     if (llama_supports_mlock()) {
         options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
@@ -1767,7 +2066,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
     options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
     options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
-    options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
     options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
 
     printf("usage: %s [options]\n", argv[0]);
@@ -1799,9 +2097,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 std::string gpt_params_get_system_info(const gpt_params & params) {
     std::ostringstream os;
 
-    os << "system_info: n_threads = " << params.n_threads;
-    if (params.n_threads_batch != -1) {
-        os << " (n_threads_batch = " << params.n_threads_batch << ")";
+    os << "system_info: n_threads = " << params.cpuparams.n_threads;
+    if (params.cpuparams_batch.n_threads != -1) {
+        os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
     }
 #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
     // TODO: windows + arm64 + mingw64
@@ -1861,13 +2159,19 @@ std::string string_get_sortable_timestamp() {
 
 void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
     if (search.empty()) {
-        return; // Avoid infinite loop if 'search' is an empty string
+        return;
     }
+    std::string builder;
+    builder.reserve(s.length());
     size_t pos = 0;
-    while ((pos = s.find(search, pos)) != std::string::npos) {
-        s.replace(pos, search.length(), replace);
-        pos += replace.length();
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
     }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
 }
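A behavioural sketch of the rewrite (not from the commit): matches are appended into a separate builder string together with the unmatched stretches between them, so each character of s is copied once and replacement text is never rescanned.

    std::string s = "a-b-c";
    string_replace_all(s, "-", "->");
    // s == "a->b->c"; because the search resumes at last_pos in the
    // original string, a replacement containing the search text
    // (e.g. "-" -> "--") cannot cause repeated expansion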
void string_process_escapes(std::string & input) { void string_process_escapes(std::string & input) {
@@ -2319,8 +2623,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_seq_max = params.n_parallel;
     cparams.n_batch = params.n_batch;
     cparams.n_ubatch = params.n_ubatch;
-    cparams.n_threads = params.n_threads;
-    cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    cparams.n_threads = params.cpuparams.n_threads;
+    cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
+                                  params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
     cparams.seed = params.seed;
     cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;
@@ -2346,6 +2651,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     return cparams;
 }
 
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
+    struct ggml_threadpool_params tpp;
+
+    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
+
+    if (params.mask_valid) {
+        std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
+    }
+
+    tpp.prio       = params.priority;
+    tpp.poll       = params.poll;
+    tpp.strict_cpu = params.strict_cpu;
+
+    return tpp;
+}
+
 #ifdef LLAMA_USE_CURL
 
 static bool starts_with(const std::string & str, const std::string & prefix) {
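For context, a one-line sketch of the intended call site (the threadpool creation API itself is outside this diff; only the conversion is shown):

    // translate CLI-level cpu_params into ggml's threadpool params: the
    // thread count, affinity mask, polling level, strict placement flag
    // and scheduling priority carry over field by field
    struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams);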
@@ -3335,7 +3656,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
 
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);


@@ -67,13 +67,18 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
+struct cpu_params {
+    int n_threads = -1;
+    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool mask_valid = false; // Default: any CPU
+    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool strict_cpu = false; // Use strict CPU placement
+    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
-    int32_t n_threads = cpu_get_num_math();
-    int32_t n_threads_draft = -1;
-    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -100,6 +105,11 @@ struct gpt_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
 
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;
+
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
@@ -204,7 +214,7 @@ struct gpt_params {
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
     int32_t timeout_write = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1; // number of threads to process HTTP requests
+    int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
 
     std::string hostname = "127.0.0.1";
     std::string public_path = "";
@@ -277,6 +287,11 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
 
 std::string gpt_params_get_system_info(const gpt_params & params);
 
+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool set_process_priority(enum ggml_sched_priority prio);
+
 //
 // String utils
 //
@ -329,6 +344,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

File diff suppressed because it is too large

View file

@@ -3,6 +3,7 @@
 from __future__ import annotations

+import ast
 import logging
 import argparse
 import contextlib
@@ -63,6 +64,7 @@ class Model:
     model_name: str | None
     metadata_override: Path | None
     dir_model_card: Path
+    is_lora: bool

     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
@@ -70,7 +72,7 @@ class Model:
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
@@ -92,6 +94,7 @@ class Model:
         self.metadata_override = metadata_override
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
+        self.is_lora = is_lora  # true if model is used inside convert_lora_to_gguf.py

         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -296,9 +299,12 @@ class Model:
                     gguf.MODEL_TENSOR.POS_EMBD,
                     gguf.MODEL_TENSOR.TOKEN_TYPES,
                     gguf.MODEL_TENSOR.SSM_CONV1D,
+                    gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                    gguf.MODEL_TENSOR.TIME_MIX_W1,
+                    gguf.MODEL_TENSOR.TIME_MIX_W2,
                 )
             )
-            or not name.endswith(".weight")
+            or not new_name.endswith(".weight")
         ):
             data_qtype = gguf.GGMLQuantizationType.F32
@@ -1588,7 +1594,7 @@ class LlamaModel(Model):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
@@ -1611,6 +1617,7 @@ class LlamaModel(Model):
                     smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                     rope_factors.append(1 / ((1 - smooth) / factor + smooth))

-            self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+            if not self.is_lora:
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))

         super().prepare_tensors()
@@ -2157,6 +2164,7 @@ class Phi3MiniModel(Model):
         if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
             raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')

-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+        if not self.is_lora:
+            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
+            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
@@ -2729,6 +2737,84 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2

+@Model.register("Rwkv6ForCausalLM")
+class Rwkv6Model(Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6
+
+    def set_vocab(self):
+        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+        vocab_size = self.hparams.get("vocab_size", 65536)
+
+        tokens: list[bytes] = ['<s>'.encode("utf-8")]
+        toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.split(' ')
+                assert len(parts) >= 3
+                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
+                token = token.encode("utf-8") if isinstance(token, str) else token
+                assert isinstance(token, bytes)
+                assert len(token) == token_len
+                token_text: str = repr(token)[2:-1]  # "b'\xff'" -> "\xff"
+                tokens.append(token_text.encode("utf-8"))
+                toktypes.append(gguf.TokenType.NORMAL)
+        remainder = vocab_size - len(tokens)
+        assert remainder >= 0
+        for i in range(len(tokens), vocab_size):
+            tokens.append(f"[PAD{i}]".encode("utf-8"))
+            toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("rwkv")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_size = self.hparams["head_size"]
+        hidden_size = self.hparams["hidden_size"]
+        layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
+        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
+            new_name += ".weight"
+
+        if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
+            data_torch = data_torch.transpose(0, 1)
+
+        if new_name.endswith("time_mix_w2.weight"):
+            data_torch = data_torch.permute(0, 2, 1)
+
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        if rescale_every_n_layers > 0:
+            if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+                data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+
+        yield (new_name, data_torch)
+
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
@@ -3833,7 +3919,7 @@ class ExaoneModel(Model):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
@@ -3856,6 +3942,7 @@ class ExaoneModel(Model):
                     smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                     rope_factors.append(1 / ((1 - smooth) / factor + smooth))

-            self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+            if not self.is_lora:
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))

         super().prepare_tensors()

View file

@@ -386,6 +386,7 @@ if __name__ == '__main__':
             dry_run=args.dry_run,
             dir_lora_model=dir_lora,
             lora_alpha=alpha,
+            is_lora=True,
         )

         logger.info("Exporting model...")

View file

@@ -336,12 +336,12 @@ Choose one of following methods to run.
 - Use device 0:

 ```sh
-./examples/sycl/run_llama2.sh 0
+./examples/sycl/run-llama2.sh 0
 ```

 - Use multiple devices:

 ```sh
-./examples/sycl/run_llama2.sh
+./examples/sycl/run-llama2.sh
 ```

 2. Command line

View file

@@ -20,7 +20,7 @@ Additionally, there the following images, similar to the above:
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)

-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).

 ## Usage
@@ -66,8 +66,8 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment
 The defaults are:

-- `CUDA_VERSION` set to `11.7.1`
-- `CUDA_DOCKER_ARCH` set to `all`
+- `CUDA_VERSION` set to `12.6.0`
+- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures

 The resulting images, are essentially the same as the non-CUDA images:
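For reference, a hypothetical build invocation overriding both defaults — the Dockerfile path under [.devops/](../.devops/) and the architecture value `86` are illustrative assumptions, not taken from this diff:

```sh
# build a CUDA image for a single compute capability instead of the default fat build
docker build -t local/llama.cpp:full-cuda \
    --build-arg CUDA_VERSION=12.6.0 \
    --build-arg CUDA_DOCKER_ARCH=86 \
    -f .devops/full-cuda.Dockerfile .
```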

View file

@@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
 #endif

 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

     if (plan.work_size > 0) {
         buf.resize(plan.work_size);

View file

@@ -21,7 +21,7 @@
 #endif

 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
@@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
 #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)

 struct benchmark_params_struct {
-    int32_t n_threads = 1;
+    int n_threads = 1;
     int32_t n_iterations = 10;
 };

View file

@@ -486,7 +486,7 @@ int main(int argc, char ** argv) {
     if (use_pca) {
         // run PCA
         PCA::pca_params pca_params;
-        pca_params.n_threads = params.n_threads;
+        pca_params.n_threads = params.cpuparams.n_threads;
         pca_params.n_batch = params.n_pca_batch;
         pca_params.n_iterations = params.n_pca_iterations;
         PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);

View file

@@ -410,7 +410,7 @@ int main(int argc, char ** argv) {
     g_verbose = (params.verbosity == 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());

View file

@@ -14,7 +14,8 @@ Performance testing tool for llama.cpp.
 1. [Markdown](#markdown)
 2. [CSV](#csv)
 3. [JSON](#json)
-4. [SQL](#sql)
+4. [JSONL](#jsonl)
+5. [SQL](#sql)

 ## Syntax
@@ -26,13 +27,17 @@ options:
   -m, --model <filename>              (default: models/7B/ggml-model-q4_0.gguf)
   -p, --n-prompt <n>                  (default: 512)
   -n, --n-gen <n>                     (default: 128)
-  -pg <pp,tg>                         (default: 512,128)
+  -pg <pp,tg>                         (default: )
   -b, --batch-size <n>                (default: 2048)
   -ub, --ubatch-size <n>              (default: 512)
   -ctk, --cache-type-k <t>            (default: f16)
   -ctv, --cache-type-v <t>            (default: f16)
-  -t, --threads <n>                   (default: 16)
+  -t, --threads <n>                   (default: 8)
+  -C, --cpu-mask <hex,hex>            (default: 0x0)
+  --cpu-strict <0|1>                  (default: 0)
+  --poll <0...100>                    (default: 50)
   -ngl, --n-gpu-layers <n>            (default: 99)
+  -rpc, --rpc <rpc_servers>           (default: )
   -sm, --split-mode <none|layer|row>  (default: layer)
   -mg, --main-gpu <i>                 (default: 0)
   -nkvo, --no-kv-offload <0|1>        (default: 0)
@@ -42,7 +47,10 @@ options:
   -embd, --embeddings <0|1>           (default: 0)
   -ts, --tensor-split <ts0/ts1/..>    (default: 0)
   -r, --repetitions <n>               (default: 5)
-  -o, --output <csv|json|md|sql>      (default: md)
+  --prio <0|1|2|3>                    (default: 0)
+  --delay <0...N> (seconds)           (default: 0)
+  -o, --output <csv|json|jsonl|md|sql>      (default: md)
+  -oe, --output-err <csv|json|jsonl|md|sql> (default: none)
   -v, --verbose                       (default: 0)

 Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
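For instance, the new CPU-affinity options combine with the existing sweep syntax; a sketch (the mask and priority values are illustrative):

```sh
# sweep 8 and 16 threads pinned to the first 16 cores, strict placement,
# medium scheduling priority, and a 5 s cool-off between test runs
./llama-bench -m models/7B/ggml-model-q4_0.gguf \
    -t 8,16 -C 0xffff --cpu-strict 1 --prio 1 --delay 5
```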
@@ -238,6 +246,19 @@ $ ./llama-bench -o json
 ]
 ```

+### JSONL
+
+```sh
+$ ./llama-bench -o jsonl
+```
+
+```json lines
+{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
+{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
+```
 ### SQL

 SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
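For example (the database file name is arbitrary):

```sh
./llama-bench -o sql | sqlite3 llama-bench.sqlite
```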

View file

@@ -16,6 +16,7 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <thread>

 #include "ggml.h"
 #include "llama.h"
@@ -170,13 +171,14 @@ static std::string get_gpu_info() {
 }

 // command line params
-enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
+enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};

 static const char * output_format_str(output_formats format) {
     switch (format) {
         case NONE:     return "none";
         case CSV:      return "csv";
         case JSON:     return "json";
+        case JSONL:    return "jsonl";
         case MARKDOWN: return "md";
         case SQL:      return "sql";
         default: GGML_ABORT("invalid output format");
@@ -190,6 +192,8 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
         format = CSV;
     } else if (s == "json") {
         format = JSON;
+    } else if (s == "jsonl") {
+        format = JSONL;
     } else if (s == "md") {
         format = MARKDOWN;
     } else if (s == "sql") {
@@ -225,6 +229,9 @@ struct cmd_params {
     std::vector<ggml_type> type_k;
     std::vector<ggml_type> type_v;
     std::vector<int> n_threads;
+    std::vector<std::string> cpu_mask;
+    std::vector<bool> cpu_strict;
+    std::vector<int> poll;
     std::vector<int> n_gpu_layers;
     std::vector<std::string> rpc_servers;
     std::vector<llama_split_mode> split_mode;
@@ -236,6 +243,8 @@ struct cmd_params {
     std::vector<bool> embeddings;
     ggml_numa_strategy numa;
     int reps;
+    ggml_sched_priority prio;
+    int delay;
     bool verbose;
     output_formats output_format;
     output_formats output_format_stderr;
@@ -251,6 +260,9 @@ static const cmd_params cmd_params_defaults = {
     /* type_k */ {GGML_TYPE_F16},
     /* type_v */ {GGML_TYPE_F16},
     /* n_threads */ {cpu_get_num_math()},
+    /* cpu_mask */ {"0x0"},
+    /* cpu_strict */ {false},
+    /* poll */ {50},
     /* n_gpu_layers */ {99},
     /* rpc_servers */ {""},
     /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
@@ -262,6 +274,8 @@ static const cmd_params cmd_params_defaults = {
     /* embeddings */ {false},
     /* numa */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps */ 5,
+    /* prio */ GGML_SCHED_PRIO_NORMAL,
+    /* delay */ 0,
     /* verbose */ false,
     /* output_format */ MARKDOWN,
     /* output_format_stderr */ NONE,
@@ -281,8 +295,13 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
     printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
+    printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
+    printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
     printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+#ifdef GGML_USE_RPC
     printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
+#endif
     printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -292,8 +311,10 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
     printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
-    printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
-    printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
+    printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
+    printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
+    printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
+    printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
     printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
     printf("\n");
     printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
@@ -338,6 +359,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.output_format_stderr = cmd_params_defaults.output_format_stderr;
     params.reps = cmd_params_defaults.reps;
     params.numa = cmd_params_defaults.numa;
+    params.prio = cmd_params_defaults.prio;
+    params.delay = cmd_params_defaults.delay;

     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -433,6 +456,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<int>(argv[i], split_delim);
             params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
+        } else if (arg == "-C" || arg == "--cpu-mask") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
+        } else if (arg == "--cpu-strict") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
+        } else if (arg == "--poll") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.poll.insert(params.poll.end(), p.begin(), p.end());
         } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -440,12 +484,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<int>(argv[i], split_delim);
             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+#ifdef GGML_USE_RPC
         } else if (arg == "-rpc" || arg == "--rpc") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.rpc_servers.push_back(argv[i]);
+#endif
         } else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -541,6 +587,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.reps = std::stoi(argv[i]);
+        } else if (arg == "--prio") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
+        } else if (arg == "--delay") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.delay = std::stoi(argv[i]);
         } else if (arg == "-o" || arg == "--output") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -585,6 +643,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
     if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
     if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
+    if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
+    if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
+    if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }

     return params;
 }
@@ -598,6 +659,9 @@ struct cmd_params_instance {
     ggml_type type_k;
     ggml_type type_v;
     int n_threads;
+    std::string cpu_mask;
+    bool cpu_strict;
+    int poll;
     int n_gpu_layers;
     std::string rpc_servers;
     llama_split_mode split_mode;
@@ -667,7 +731,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & tv : params.type_v)
     for (const auto & nkvo : params.no_kv_offload)
     for (const auto & fa : params.flash_attn)
-    for (const auto & nt : params.n_threads) {
+    for (const auto & nt : params.n_threads)
+    for (const auto & cm : params.cpu_mask)
+    for (const auto & cs : params.cpu_strict)
+    for (const auto & pl : params.poll) {
         for (const auto & n_prompt : params.n_prompt) {
             if (n_prompt == 0) {
                 continue;
@@ -681,6 +748,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_k = */ tk,
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
+                /* .cpu_mask = */ cm,
+                /* .cpu_strict = */ cs,
+                /* .poll = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
@@ -707,6 +777,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_k = */ tk,
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
+                /* .cpu_mask = */ cm,
+                /* .cpu_strict = */ cs,
+                /* .poll = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
@@ -733,6 +806,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_k = */ tk,
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
+                /* .cpu_mask = */ cm,
+                /* .cpu_strict = */ cs,
+                /* .poll = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
@@ -769,6 +845,9 @@ struct test {
     int n_batch;
     int n_ubatch;
     int n_threads;
+    std::string cpu_mask;
+    bool cpu_strict;
+    int poll;
     bool has_rpc;
     ggml_type type_k;
     ggml_type type_v;
@@ -795,6 +874,9 @@ struct test {
         n_batch = inst.n_batch;
         n_ubatch = inst.n_ubatch;
         n_threads = inst.n_threads;
+        cpu_mask = inst.cpu_mask;
+        cpu_strict = inst.cpu_strict;
+        poll = inst.poll;
         has_rpc = !inst.rpc_servers.empty();
         type_k = inst.type_k;
         type_v = inst.type_v;
@@ -872,13 +954,14 @@ struct test {
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_ubatch",
-            "n_threads", "type_k", "type_v",
+            "n_threads", "cpu_mask", "cpu_strict", "poll",
+            "type_k", "type_v",
             "n_gpu_layers", "split_mode",
             "main_gpu", "no_kv_offload", "flash_attn",
             "tensor_split", "use_mmap", "embeddings",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
-            "avg_ts", "stddev_ts"
+            "avg_ts", "stddev_ts",
         };
         return fields;
     }
@@ -887,7 +970,7 @@ struct test {
     static field_type get_field_type(const std::string & field) {
         if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
-            field == "n_threads" ||
+            field == "n_threads" || field == "poll" ||
             field == "model_size" || field == "model_n_params" ||
             field == "n_gpu_layers" || field == "main_gpu" ||
             field == "n_prompt" || field == "n_gen" ||
@@ -896,6 +979,7 @@ struct test {
         }
         if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+            field == "cpu_strict" ||
             field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
             return BOOL;
         }
@@ -928,7 +1012,8 @@ struct test {
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_ubatch),
-            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
+            std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
+            ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
             tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@@ -996,8 +1081,6 @@ struct csv_printer : public printer {
     }
 };

-struct json_printer : public printer {
-    bool first = true;

 static std::string escape_json(const std::string & value) {
     std::string escaped;
@@ -1017,7 +1100,7 @@ struct json_printer : public printer {
     return escaped;
 }

-static std::string format_value(const std::string & field, const std::string & value) {
+static std::string format_json_value(const std::string & field, const std::string & value) {
     switch (test::get_field_type(field)) {
         case test::STRING:
             return "\"" + escape_json(value) + "\"";
@@ -1028,6 +1111,9 @@
     }
 }

+struct json_printer : public printer {
+    bool first = true;
+
     void print_header(const cmd_params & params) override {
         fprintf(fout, "[\n");
         (void) params;
@@ -1036,7 +1122,7 @@ struct json_printer : public printer {
     void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
         assert(fields.size() == values.size());
         for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
+            fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
         }
     }
@@ -1059,6 +1145,25 @@ struct json_printer : public printer {
     }
 };

+struct jsonl_printer : public printer {
+    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
+        assert(fields.size() == values.size());
+        for (size_t i = 0; i < fields.size(); i++) {
+            fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
+        }
+    }
+
+    void print_test(const test & t) override {
+        fprintf(fout, "{");
+        print_fields(test::get_fields(), t.get_values());
+        fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
+        fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
+        fprintf(fout, "}\n");
+        fflush(fout);
+    }
+};
+
 struct markdown_printer : public printer {
     std::vector<std::string> fields;
@@ -1067,7 +1172,7 @@ struct markdown_printer : public printer {
         return -30;
     }
     if (field == "t/s") {
-        return 16;
+        return 20;
     }
     if (field == "size" || field == "params") {
         return 10;
@@ -1149,6 +1254,15 @@ struct markdown_printer : public printer {
         if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
             fields.emplace_back("n_threads");
         }
+        if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
+            fields.emplace_back("cpu_mask");
+        }
+        if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
+            fields.emplace_back("cpu_strict");
+        }
+        if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
+            fields.emplace_back("poll");
+        }
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
             fields.emplace_back("n_batch");
         }
@@ -1350,6 +1464,8 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
         return std::unique_ptr<printer>(new csv_printer());
     case JSON:
         return std::unique_ptr<printer>(new json_printer());
+    case JSONL:
+        return std::unique_ptr<printer>(new jsonl_printer());
     case MARKDOWN:
         return std::unique_ptr<printer>(new markdown_printer());
     case SQL:
@@ -1383,6 +1499,8 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);

+    set_process_priority(params.prio);
+
     // initialize printer
     std::unique_ptr<printer> p = create_printer(params.output_format);
     std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
@@ -1428,6 +1546,28 @@ int main(int argc, char ** argv) {
         llama_kv_cache_clear(ctx);

+        // cool off before the test
+        if (params.delay) {
+            std::this_thread::sleep_for(std::chrono::seconds(params.delay));
+        }
+
+        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
+        if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
+            LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+            exit(1);
+        }
+        tpp.strict_cpu = t.cpu_strict;
+        tpp.poll = t.poll;
+        tpp.prio = params.prio;
+
+        struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+        if (!threadpool) {
+            LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+            exit(1);
+        }
+
+        llama_attach_threadpool(ctx, threadpool, NULL);
+
         // warmup run
         if (t.n_prompt > 0) {
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
@@ -1466,6 +1606,8 @@ int main(int argc, char ** argv) {
         llama_print_timings(ctx);

         llama_free(ctx);
+
+        ggml_threadpool_free(threadpool);
     }

     llama_free_model(lmodel);

View file

@@ -71,8 +71,8 @@ actor LlamaContext {
         var ctx_params = llama_context_default_params()
         ctx_params.seed = 1234
         ctx_params.n_ctx = 2048
-        ctx_params.n_threads = UInt32(n_threads)
-        ctx_params.n_threads_batch = UInt32(n_threads)
+        ctx_params.n_threads = Int32(n_threads)
+        ctx_params.n_threads_batch = Int32(n_threads)

         let context = llama_new_context_with_model(model, ctx_params)
         guard let context else {

View file

@@ -15,8 +15,8 @@ cd llama.cpp
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)

 ```bash
-python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
-python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
 python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model

 # quantize int4 version

View file

@@ -216,13 +216,19 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
     if (search.empty()) {
-        return; // Avoid infinite loop if 'search' is an empty string
+        return;
     }
+    std::string builder;
+    builder.reserve(s.length());
     size_t pos = 0;
-    while ((pos = s.find(search, pos)) != std::string::npos) {
-        s.replace(pos, search.length(), replace);
-        pos += replace.length();
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
     }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
 }

 static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
@@ -1617,7 +1623,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
     }
 }

-inline float clip(float x, float lower, float upper) {
+inline int clip(int x, int lower, int upper) {
     return std::max(lower, std::min(x, upper));
 }
@@ -1821,10 +1827,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
     return refine_size;
 }

-inline int clip(int x, int lower, int upper) {
-    return std::max(lower, std::min(x, upper));
-}
-
 static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
     std::vector<int> candidate_split_grids_nums;
     for (int i : {multiple - 1, multiple, multiple + 1}) {

View file

@@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
         if (!params->image.empty()) {
             LOG_TEE("using base64 encoded image instead of command line image path\n");
         }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
+        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
         if (!embed) {
             LOG_TEE("%s: can't load image from prompt\n", __func__);
             return NULL;
         }
         params->prompt = remove_image_from_prompt(prompt);
     } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
         if (!embed) {
             fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
             return NULL;

View file

@@ -180,7 +180,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
 static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
     auto ctx_clip = clip_init_context(params);
-    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
+    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
     if (!embeds) {
         std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
         return NULL;

View file

@@ -221,6 +221,40 @@ int main(int argc, char ** argv) {
         return 1;
     }

+    LOG("%s: llama threadpool init = n_threads = %d\n",
+        __func__,
+        (int) params.cpuparams.n_threads
+    );
+    struct ggml_threadpool_params tpp_batch =
+            ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
+    struct ggml_threadpool_params tpp =
+            ggml_threadpool_params_from_cpu_params(params.cpuparams);
+
+    set_process_priority(params.cpuparams.priority);
+
+    struct ggml_threadpool * threadpool_batch = NULL;
+    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
+        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        if (!threadpool_batch) {
+            LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+            exit(1);
+        }
+
+        // Start the non-batch threadpool in the paused state
+        tpp.paused = true;
+    }
+
+    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+    if (!threadpool) {
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
+    }
+
+    llama_attach_threadpool(ctx, threadpool, threadpool_batch);
+    if (ctx_guidance) {
+        llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch);
+    }
+
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
     LOG("n_ctx: %d\n", n_ctx);
@@ -352,8 +386,8 @@ int main(int argc, char ** argv) {
     }

     LOGLN(
-            "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu",
-            log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
+            "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu",
+            log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size());

     // if we will use the cache for the full prompt without reaching the end of the cache, force
     // reevaluation of the last token to recalculate the cached logits
@@ -989,6 +1023,9 @@ int main(int argc, char ** argv) {
     llama_sampling_free(ctx_sampling);
     llama_backend_free();

+    ggml_threadpool_free(threadpool);
+    ggml_threadpool_free(threadpool_batch);
+
 #ifndef LOG_DISABLE_LOGS
     LOG_TEE("Log end\n");
 #endif // LOG_DISABLE_LOGS

View file

@@ -106,7 +106,7 @@ static void usage(const char * executable) {
     printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
     printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
-    printf(" --keep-split: will generate quatized model in the same shards as input");
+    printf(" --keep-split: will generate quantized model in the same shards as input\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");

View file

@@ -249,23 +249,49 @@ logging:
 Available environment variables (if specified, these variables will override parameters specified in arguments):

-- `LLAMA_CACHE` (cache directory, used by `--hf-repo`)
-- `HF_TOKEN` (Hugging Face access token, used when accessing a gated model with `--hf-repo`)
-- `LLAMA_ARG_MODEL`
-- `LLAMA_ARG_THREADS`
-- `LLAMA_ARG_CTX_SIZE`
-- `LLAMA_ARG_N_PARALLEL`
-- `LLAMA_ARG_BATCH`
-- `LLAMA_ARG_UBATCH`
-- `LLAMA_ARG_N_GPU_LAYERS`
-- `LLAMA_ARG_THREADS_HTTP`
-- `LLAMA_ARG_CHAT_TEMPLATE`
-- `LLAMA_ARG_N_PREDICT`
-- `LLAMA_ARG_ENDPOINT_METRICS`
-- `LLAMA_ARG_ENDPOINT_SLOTS`
-- `LLAMA_ARG_EMBEDDINGS`
-- `LLAMA_ARG_FLASH_ATTN`
-- `LLAMA_ARG_DEFRAG_THOLD`
+- `LLAMA_CACHE`: cache directory, used by `--hf-repo`
+- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo`
+- `LLAMA_ARG_MODEL`: equivalent to `-m`
+- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu`
+- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a`
+- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo`
+- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file`
+- `LLAMA_ARG_THREADS`: equivalent to `-t`
+- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c`
+- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np`
+- `LLAMA_ARG_BATCH`: equivalent to `-b`
+- `LLAMA_ARG_UBATCH`: equivalent to `-ub`
+- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl`
+- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http`
+- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template`
+- `LLAMA_ARG_N_PREDICT`: equivalent to `-n`
+- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`)
+- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default.
+- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`)
+- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`)
+- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default.
+- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt`
+- `LLAMA_ARG_HOST`: equivalent to `--host`
+- `LLAMA_ARG_PORT`: equivalent to `--port`
+
+Example usage of docker compose with environment variables:
+
+```yml
+services:
+  llamacpp-server:
+    image: ghcr.io/ggerganov/llama.cpp:server
+    ports:
+      - 8080:8080
+    volumes:
+      - ./models:/models
+    environment:
+      # alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model
+      LLAMA_ARG_MODEL: /models/my_model.gguf
+      LLAMA_ARG_CTX_SIZE: 4096
+      LLAMA_ARG_N_PARALLEL: 2
+      LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0
+      LLAMA_ARG_PORT: 8080
+```
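The same variables also work without docker; a minimal sketch, assuming a local `llama-server` build and the model path used above:

```sh
LLAMA_ARG_MODEL=/models/my_model.gguf \
LLAMA_ARG_CTX_SIZE=4096 \
LLAMA_ARG_N_PARALLEL=2 \
./llama-server --port 8080
```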
 ## Build

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View file

@@ -23,6 +23,8 @@ from prometheus_client import parser
 # pyright: reportRedeclaration=false

+DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600)
+
 @step("a server listening on {server_fqdn}:{server_port}")
 def step_server_config(context, server_fqdn: str, server_port: str):
     context.server_fqdn = server_fqdn
@@ -689,7 +691,7 @@ def step_tokenize_set_add_special(context):
 @async_run_until_complete
 async def step_tokenize(context):
     context.tokenized_text = context_text(context)
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         tokenize_args = {
             "content": context.tokenized_text,
         }
@@ -706,7 +708,7 @@ async def step_tokenize(context):
 @async_run_until_complete
 async def step_detokenize(context):
     assert len(context.tokens) > 0
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         async with session.post(f'{context.base_url}/detokenize',
                                 json={
                                     "tokens": context.tokens,
@@ -735,7 +737,7 @@ def step_strings_for_tokenization(context):
 @step('an OPTIONS request is sent from {origin}')
 @async_run_until_complete
 async def step_options_request(context, origin):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         headers = {'Authorization': f'Bearer {context.user_api_key}', 'Origin': origin}
         async with session.options(f'{context.base_url}/v1/chat/completions',
                                    headers=headers) as response:
@@ -751,7 +753,7 @@ def step_check_options_header_value(context, cors_header, cors_header_value):
 @step('prometheus metrics are exposed')
 @async_run_until_complete
 async def step_prometheus_metrics_exported(context):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         async with await session.get(f'{context.base_url}/metrics') as metrics_response:
             assert metrics_response.status == 200
             assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
@@ -818,13 +820,13 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
     for prompt_no in range(context.n_prompts):
         shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
         context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
await asyncio.sleep(0.1) await asyncio.sleep(0.01)
@step('the slot {slot_id:d} is saved with filename "{filename}"') @step('the slot {slot_id:d} is saved with filename "{filename}"')
@async_run_until_complete @async_run_until_complete
async def step_save_slot(context, slot_id, filename): async def step_save_slot(context, slot_id, filename):
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
async with session.post(f'{context.base_url}/slots/{slot_id}?action=save', async with session.post(f'{context.base_url}/slots/{slot_id}?action=save',
json={"filename": filename}, json={"filename": filename},
headers={"Content-Type": "application/json"}) as response: headers={"Content-Type": "application/json"}) as response:
@ -834,7 +836,7 @@ async def step_save_slot(context, slot_id, filename):
@step('the slot {slot_id:d} is restored with filename "{filename}"') @step('the slot {slot_id:d} is restored with filename "{filename}"')
@async_run_until_complete @async_run_until_complete
async def step_restore_slot(context, slot_id, filename): async def step_restore_slot(context, slot_id, filename):
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore', async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore',
json={"filename": filename}, json={"filename": filename},
headers={"Content-Type": "application/json"}) as response: headers={"Content-Type": "application/json"}) as response:
@ -844,7 +846,7 @@ async def step_restore_slot(context, slot_id, filename):
@step('the slot {slot_id:d} is erased') @step('the slot {slot_id:d} is erased')
@async_run_until_complete @async_run_until_complete
async def step_erase_slot(context, slot_id): async def step_erase_slot(context, slot_id):
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase', async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase',
headers={"Content-Type": "application/json"}) as response: headers={"Content-Type": "application/json"}) as response:
context.response = response context.response = response
@ -853,7 +855,7 @@ async def step_erase_slot(context, slot_id):
@step('switch {on_or_off} lora adapter {lora_id:d}') @step('switch {on_or_off} lora adapter {lora_id:d}')
@async_run_until_complete @async_run_until_complete
async def toggle_lora_adapter(context, on_or_off: str, lora_id: int): async def toggle_lora_adapter(context, on_or_off: str, lora_id: int):
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
async with session.post(f'{context.base_url}/lora-adapters', async with session.post(f'{context.base_url}/lora-adapters',
json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}], json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}],
headers={"Content-Type": "application/json"}) as response: headers={"Content-Type": "application/json"}) as response:
@ -889,7 +891,7 @@ async def request_completion(prompt,
print(f"Set user_api_key: {user_api_key}") print(f"Set user_api_key: {user_api_key}")
headers['Authorization'] = f'Bearer {user_api_key}' headers['Authorization'] = f'Bearer {user_api_key}'
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
async with session.post(f'{base_url}/completion', async with session.post(f'{base_url}/completion',
json={ json={
"input_prefix": prompt_prefix, "input_prefix": prompt_prefix,
@ -902,8 +904,7 @@ async def request_completion(prompt,
"temperature": temperature if temperature is not None else 0.8, "temperature": temperature if temperature is not None else 0.8,
"n_probs": 2, "n_probs": 2,
}, },
headers=headers, headers=headers) as response:
timeout=3600) as response:
if expect_api_error is None or not expect_api_error: if expect_api_error is None or not expect_api_error:
assert response.status == 200 assert response.status == 200
assert response.headers['Access-Control-Allow-Origin'] == origin assert response.headers['Access-Control-Allow-Origin'] == origin
@ -961,7 +962,7 @@ async def oai_chat_completions(user_prompt,
if async_client: if async_client:
origin = 'llama.cpp' origin = 'llama.cpp'
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
async with session.post(f'{base_url}{base_path}', async with session.post(f'{base_url}{base_path}',
json=payload, json=payload,
headers=headers) as response: headers=headers) as response:
@ -1048,7 +1049,7 @@ async def oai_chat_completions(user_prompt,
async def request_embedding(content, seed, base_url=None) -> list[list[float]]: async def request_embedding(content, seed, base_url=None) -> list[list[float]]:
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
async with session.post(f'{base_url}/embedding', async with session.post(f'{base_url}/embedding',
json={ json={
"content": content, "content": content,
@ -1068,14 +1069,13 @@ async def request_oai_embeddings(input, seed,
headers=[] headers=[]
if user_api_key is not None: if user_api_key is not None:
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
async with session.post(f'{base_url}/v1/embeddings', async with session.post(f'{base_url}/v1/embeddings',
json={ json={
"input": input, "input": input,
"model": model, "model": model,
}, },
headers=headers, headers=headers) as response:
timeout=3600) as response:
assert response.status == 200, f"received status code not expected: {response.status}" assert response.status == 200, f"received status code not expected: {response.status}"
assert response.headers['Access-Control-Allow-Origin'] == origin assert response.headers['Access-Control-Allow-Origin'] == origin
assert response.headers['Content-Type'] == "application/json; charset=utf-8" assert response.headers['Content-Type'] == "application/json; charset=utf-8"
@ -1194,7 +1194,7 @@ async def wait_for_slots_status(context,
if 'GITHUB_ACTIONS' in os.environ: if 'GITHUB_ACTIONS' in os.environ:
timeout *= 2 timeout *= 2
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
while True: while True:
async with await session.get(f'{base_url}/slots', params=params) as slots_response: async with await session.get(f'{base_url}/slots', params=params) as slots_response:
status_code = slots_response.status status_code = slots_response.status
@ -1237,7 +1237,7 @@ def assert_embeddings(embeddings):
async def request_slots_status(context, expected_slots): async def request_slots_status(context, expected_slots):
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
async with await session.get(f'{context.base_url}/slots') as slots_response: async with await session.get(f'{context.base_url}/slots') as slots_response:
assert slots_response.status == 200 assert slots_response.status == 200
slots = await slots_response.json() slots = await slots_response.json()

View file

@ -8,9 +8,12 @@ Feature: Wrong usage of llama.cpp server
Scenario: Infinite loop Scenario: Infinite loop
Given a server listening on localhost:8080 Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And 42 as server seed
And 2048 KV cache size
# Uncomment below to fix the issue # Uncomment below to fix the issue
#And 64 server max tokens to predict #And 64 server max tokens to predict
Then the server is starting Then the server is starting
Then the server is healthy
Given a prompt: Given a prompt:
""" """
Go to: infinite loop Go to: infinite loop

View file

@ -3,6 +3,14 @@
#include "llama.h" #include "llama.h"
#include "common.h" #include "common.h"
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
#define CPPHTTPLIB_NO_EXCEPTIONS 1
#endif
// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
#include "httplib.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT: // Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT #define JSON_ASSERT GGML_ASSERT
#include "json.hpp" #include "json.hpp"
@ -279,6 +287,18 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
return std::string::npos; return std::string::npos;
} }
static bool json_is_array_of_numbers(json data) {
if (data.is_array()) {
for (const auto & e : data) {
if (!e.is_number()) {
return false;
}
}
return true;
}
return false;
}
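For reference, a minimal sketch of what this helper accepts and rejects (the inputs are illustrative; `json` is nlohmann's type from `json.hpp` above, and the server presumably uses the check to validate prompts passed as arrays of token ids):

```cpp
// Hedged usage sketch for json_is_array_of_numbers (inputs are hypothetical).
const json ids   = json::parse("[1, 2, 3]");     // array of numbers   -> true
const json mixed = json::parse("[1, \"x\", 3]"); // mixed element types -> false
const json obj   = json::parse("{\"a\": 1}");    // not an array        -> false
bool ok = json_is_array_of_numbers(ids);         // note: [] also yields true
```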
// TODO: reuse llama_detokenize // TODO: reuse llama_detokenize
template <class Iter> template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
@ -343,6 +363,19 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
return out; return out;
} }
static bool server_sent_event(httplib::DataSink & sink, const char * event, json & data) {
const std::string str =
std::string(event) + ": " +
data.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n";
LOG_VERBOSE("data stream", {
{ "to_send", str }
});
return sink.write(str.c_str(), str.size());
}
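For context, a minimal sketch of how such a helper can be driven from a cpp-httplib chunked-content provider; the `/stream` route and the single-event body below are illustrative, not the server's actual wiring:

```cpp
// Hypothetical SSE route using server_sent_event (assumes an httplib::Server svr).
svr.Get("/stream", [](const httplib::Request &, httplib::Response & res) {
    res.set_chunked_content_provider("text/event-stream",
        [](size_t /*offset*/, httplib::DataSink & sink) {
            json data = {{"content", "hello"}};
            server_sent_event(sink, "data", data); // writes "data: {...}\n\n"
            sink.done();                           // close the event stream
            return true;                           // provider completed without error
        });
});
```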
// //
// OAI utils // OAI utils
// //

View file

@ -73,10 +73,11 @@ int main(int argc, char ** argv) {
// load the draft model // load the draft model
params.model = params.model_draft; params.model = params.model_draft;
params.n_gpu_layers = params.n_gpu_layers_draft; params.n_gpu_layers = params.n_gpu_layers_draft;
if (params.n_threads_draft > 0) { if (params.draft_cpuparams.n_threads > 0) {
params.n_threads = params.n_threads_draft; params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
} }
params.n_threads_batch = params.n_threads_batch_draft;
params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
llama_init_result llama_init_dft = llama_init_from_gpt_params(params); llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
model_dft = llama_init_dft.model; model_dft = llama_init_dft.model;
ctx_dft = llama_init_dft.context; ctx_dft = llama_init_dft.context;

flake.lock generated
View file

@ -5,11 +5,11 @@
"nixpkgs-lib": "nixpkgs-lib" "nixpkgs-lib": "nixpkgs-lib"
}, },
"locked": { "locked": {
"lastModified": 1722555600, "lastModified": 1725024810,
"narHash": "sha256-XOQkdLafnb/p9ij77byFQjDf5m5QYl9b2REiVClC+x4=", "narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
"owner": "hercules-ci", "owner": "hercules-ci",
"repo": "flake-parts", "repo": "flake-parts",
"rev": "8471fe90ad337a8074e957b69ca4d0089218391d", "rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1723637854, "lastModified": 1724819573,
"narHash": "sha256-med8+5DSWa2UnOqtdICndjDAEjxr5D7zaIiK4pn0Q7c=", "narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "c3aa7b8938b17aebd2deecf7be0636000d62a2b9", "rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
"type": "github" "type": "github"
}, },
"original": { "original": {

View file

@ -145,7 +145,9 @@
# the same path you would with an overlay. # the same path you would with an overlay.
legacyPackages = { legacyPackages = {
llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; }; llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix { inherit llamaVersion; }; llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix {
inherit llamaVersion;
};
llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; }; llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; }; llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
}; };
@ -157,6 +159,7 @@
default = config.legacyPackages.llamaPackages.llama-cpp; default = config.legacyPackages.llamaPackages.llama-cpp;
vulkan = config.packages.default.override { useVulkan = true; }; vulkan = config.packages.default.override { useVulkan = true; };
windows = config.legacyPackages.llamaPackagesWindows.llama-cpp; windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
python-scripts = config.legacyPackages.llamaPackages.python-scripts;
} }
// lib.optionalAttrs pkgs.stdenv.isLinux { // lib.optionalAttrs pkgs.stdenv.isLinux {
cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp; cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;

View file

@ -63,6 +63,7 @@ extern "C" {
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// "offset" refers to the offset of the tensor data for setting/getting data
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
@ -102,6 +103,7 @@ extern "C" {
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer // Create a backend buffer from an existing pointer

View file

@ -220,7 +220,7 @@
#include <stdio.h> #include <stdio.h>
#define GGML_FILE_MAGIC 0x67676d6c // "ggml" #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 1 #define GGML_FILE_VERSION 2
#define GGML_QNT_VERSION 2 // bump this on quantization format changes #define GGML_QNT_VERSION 2 // bump this on quantization format changes
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
@ -231,6 +231,8 @@
#define GGML_MAX_SRC 10 #define GGML_MAX_SRC 10
#ifndef GGML_MAX_NAME #ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 64 #define GGML_MAX_NAME 64
#define GGML_MAX_N_THREADS 512
#endif #endif
#define GGML_MAX_OP_PARAMS 64 #define GGML_MAX_OP_PARAMS 64
#define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_N_THREADS 4
@ -455,6 +457,8 @@ extern "C" {
GGML_OP_SQR, GGML_OP_SQR,
GGML_OP_SQRT, GGML_OP_SQRT,
GGML_OP_LOG, GGML_OP_LOG,
GGML_OP_SIN,
GGML_OP_COS,
GGML_OP_SUM, GGML_OP_SUM,
GGML_OP_SUM_ROWS, GGML_OP_SUM_ROWS,
GGML_OP_MEAN, GGML_OP_MEAN,
@ -492,9 +496,11 @@ extern "C" {
GGML_OP_CLAMP, GGML_OP_CLAMP,
GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL, GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D, GGML_OP_POOL_1D,
GGML_OP_POOL_2D, GGML_OP_POOL_2D,
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD, GGML_OP_PAD,
GGML_OP_ARANGE, GGML_OP_ARANGE,
@ -510,6 +516,7 @@ extern "C" {
GGML_OP_WIN_UNPART, GGML_OP_WIN_UNPART,
GGML_OP_GET_REL_POS, GGML_OP_GET_REL_POS,
GGML_OP_ADD_REL_POS, GGML_OP_ADD_REL_POS,
GGML_OP_RWKV_WKV,
GGML_OP_UNARY, GGML_OP_UNARY,
@ -544,6 +551,7 @@ extern "C" {
GGML_UNARY_OP_SILU, GGML_UNARY_OP_SILU,
GGML_UNARY_OP_HARDSWISH, GGML_UNARY_OP_HARDSWISH,
GGML_UNARY_OP_HARDSIGMOID, GGML_UNARY_OP_HARDSIGMOID,
GGML_UNARY_OP_EXP,
GGML_UNARY_OP_COUNT, GGML_UNARY_OP_COUNT,
}; };
@ -626,6 +634,29 @@ extern "C" {
// If it returns true, the computation is aborted // If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data); typedef bool (*ggml_abort_callback)(void * data);
// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute() // the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287 // since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan { struct ggml_cplan {
@ -633,6 +664,7 @@ extern "C" {
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads; int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true // abort ggml_graph_compute when true
ggml_abort_callback abort_callback; ggml_abort_callback abort_callback;
@ -971,6 +1003,22 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sin(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sin_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_cos(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_cos_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
// return scalar // return scalar
GGML_API struct ggml_tensor * ggml_sum( GGML_API struct ggml_tensor * ggml_sum(
struct ggml_context * ctx, struct ggml_context * ctx,
@ -1121,6 +1169,14 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_exp(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_exp_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
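As a rough illustration of how these new element-wise ops compose, here is a minimal sketch computing `y = exp(sin(x))` on the CPU; the buffer size, tensor shape, and thread count are arbitrary choices, not values from the tree:

```cpp
// Hedged sketch: build and run a tiny graph with the new ops.
struct ggml_init_params ip = {
    /*.mem_size   =*/ 16*1024*1024,
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ false,
};
struct ggml_context * ctx = ggml_init(ip);
struct ggml_tensor  * x   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
struct ggml_tensor  * y   = ggml_exp(ctx, ggml_sin(ctx, x));
struct ggml_cgraph  * gf  = ggml_new_graph(ctx);
ggml_build_forward_expand(gf, y);
// ... fill x->data with 8 floats, then:
ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 4);
ggml_free(ctx);
```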
// normalize along rows // normalize along rows
GGML_API struct ggml_tensor * ggml_norm( GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx, struct ggml_context * ctx,
@ -1568,34 +1624,49 @@ extern "C" {
float min, float min,
float max); float max);
// im2col
// converts data into a format that effectively results in a convolution when combined with matrix multiplication
GGML_API struct ggml_tensor * ggml_im2col( GGML_API struct ggml_tensor * ggml_im2col(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, struct ggml_tensor * b, // data
int s0, int s0, // stride dimension 0
int s1, int s1, // stride dimension 1
int p0, int p0, // padding dimension 0
int p1, int p1, // padding dimension 1
int d0, int d0, // dilation dimension 0
int d1, int d1, // dilation dimension 1
bool is_2D, bool is_2D,
enum ggml_type dst_type); enum ggml_type dst_type);
GGML_API struct ggml_tensor * ggml_im2col_back(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // gradient of im2col output
int64_t * ne, // shape of im2col input
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1, // dilation dimension 1
bool is_2D);
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d( GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, struct ggml_tensor * b, // data
int s0, int s0, // stride dimension 0
int s1, int s1, // stride dimension 1
int p0, int p0, // padding dimension 0
int p1, int p1, // padding dimension 1
int d0, int d0, // dilation dimension 0
int d1); int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_conv_1d( GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, struct ggml_tensor * b, // data
int s0, // stride int s0, // stride
int p0, // padding int p0, // padding
int d0); // dilation int d0); // dilation
@ -1604,29 +1675,29 @@ extern "C" {
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
GGML_API struct ggml_tensor* ggml_conv_1d_ph( GGML_API struct ggml_tensor* ggml_conv_1d_ph(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, struct ggml_tensor * b, // data
int s, int s, // stride
int d); int d); // dilation
GGML_API struct ggml_tensor * ggml_conv_transpose_1d( GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, struct ggml_tensor * b, // data
int s0, int s0, // stride
int p0, int p0, // padding
int d0); int d0); // dilation
GGML_API struct ggml_tensor * ggml_conv_2d( GGML_API struct ggml_tensor * ggml_conv_2d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, struct ggml_tensor * b, // data
int s0, int s0, // stride dimension 0
int s1, int s1, // stride dimension 1
int p0, int p0, // padding dimension 0
int p1, int p1, // padding dimension 1
int d0, int d0, // dilation dimension 0
int d1); int d1); // dilation dimension 1
// kernel size is a->ne[0] x a->ne[1] // kernel size is a->ne[0] x a->ne[1]
@ -1688,6 +1759,18 @@ extern "C" {
float p0, float p0,
float p1); float p1);
GGML_API struct ggml_tensor * ggml_pool_2d_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * af, // "a"/input used in forward pass
enum ggml_op_pool op,
int k0,
int k1,
int s0,
int s1,
float p0,
float p1);
// nearest interpolate // nearest interpolate
// multiplies ne0 and ne1 by scale factor // multiplies ne0 and ne1 by scale factor
// used in stable-diffusion // used in stable-diffusion
@ -1762,7 +1845,8 @@ extern "C" {
struct ggml_tensor * v, struct ggml_tensor * v,
struct ggml_tensor * mask, struct ggml_tensor * mask,
float scale, float scale,
float max_bias); float max_bias,
float logit_softcap);
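The new `logit_softcap` parameter presumably implements the tanh soft-capping popularized by Gemma-2-style models; as a scalar sketch of the assumed semantics (including the assumption that a non-positive value disables capping), each attention score would be squashed before the softmax roughly like this:

```cpp
#include <cmath>

// Hedged scalar sketch of logit soft-capping (assumed semantics, not the kernel code).
static inline float softcap(float s, float logit_softcap) {
    if (logit_softcap <= 0.0f) {
        return s; // assumption: capping disabled when the cap is not positive
    }
    return logit_softcap * tanhf(s / logit_softcap); // bounds |s| by logit_softcap
}
```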
GGML_API void ggml_flash_attn_ext_set_prec( GGML_API void ggml_flash_attn_ext_set_prec(
struct ggml_tensor * a, struct ggml_tensor * a,
@ -1841,6 +1925,15 @@ extern "C" {
struct ggml_tensor * pw, struct ggml_tensor * pw,
struct ggml_tensor * ph); struct ggml_tensor * ph);
GGML_API struct ggml_tensor * ggml_rwkv_wkv(
struct ggml_context * ctx,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * r,
struct ggml_tensor * tf,
struct ggml_tensor * td,
struct ggml_tensor * state);
// custom operators // custom operators
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@ -2011,10 +2104,23 @@ extern "C" {
GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute() // ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data // when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); GGML_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context // same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
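Putting the new threadpool API together with the updated `ggml_graph_plan()` signature, a minimal sketch (assuming an already-built `struct ggml_cgraph * graph`; the thread count is arbitrary):

```cpp
#include <vector>

// Hedged sketch: run one graph on an explicit threadpool.
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(/*n_threads =*/ 8);
struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);

struct ggml_cplan plan = ggml_graph_plan(graph, /*n_threads =*/ 8, tp);
std::vector<uint8_t> work(plan.work_size); // caller allocates the work buffer
plan.work_data = work.data();

ggml_graph_compute(graph, &plan);

ggml_threadpool_free(tp);
```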

View file

@ -1247,7 +1247,7 @@ endif()
# Data types, macros and functions related to controlling CPU affinity and # Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc # some memory allocation are available on Linux through GNU extensions in libc
if (CMAKE_SYSTEM_NAME MATCHES "Linux") if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
add_compile_definitions(_GNU_SOURCE) add_compile_definitions(_GNU_SOURCE)
endif() endif()

View file

@ -36,6 +36,84 @@
// from bias offset form to pure sign form (this saves subtract // from bias offset form to pure sign form (this saves subtract
// operations during unpacking) // operations during unpacking)
// //
#if defined(__AVX__)
#if defined(__F16C__)
// the _mm256_cvt intrinsics require F16C
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
#define GGML_F32Cx8_REPEAT_LOAD(x, loadMask) _mm256_cvtph_ps(_mm_shuffle_epi32(_mm_maskload_epi32((int const*)(x), loadMask), 68))
#define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask) _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask))
#else
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
float tmp[8];
for (int i = 0; i < 8; i++) {
tmp[i] = GGML_FP16_TO_FP32(x[i]);
}
return _mm256_loadu_ps(tmp);
}
static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) {
float tmp[8];
for (int i = 0; i < 4; i++) {
tmp[i] = GGML_FP16_TO_FP32(x[i]);
tmp[i + 4] = GGML_FP16_TO_FP32(x[i]);
}
return _mm256_loadu_ps(tmp);
}
static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrangeMask) {
uint16_t tmphalf[8];
float tmp[8];
_mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
for (int i = 0; i < 8; i++) {
tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
}
return _mm256_loadu_ps(tmp);
}
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
#define GGML_F32Cx8_REPEAT_LOAD(x, loadMask) __avx_repeat_f32cx8_load(x)
#define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask) __avx_rearranged_f32cx8_load(x, arrangeMask)
#endif
#endif
#if defined(__AVX2__) || defined(__AVX512F__)
static inline __m256i sum_i16_pairs_int(const __m256i x) {
const __m256i ones = _mm256_set1_epi16(1);
return _mm256_madd_epi16(ones, x);
}
static inline __m256i mul_sum_us8_pairs_int(const __m256i ax, const __m256i sy) {
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
const __m256i zero = _mm256_setzero_si256();
return _mm256_dpbusd_epi32(zero, ax, sy);
#else
// Perform multiplication and create 16-bit values
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
return sum_i16_pairs_int(dot);
#endif
}
// Integer variant of the function defined in ggml-quants.c
// multiply int8_t, add results pairwise twice and return as 32-bit integer vector
static inline __m256i mul_sum_i8_pairs_int(const __m256i x, const __m256i y) {
#if __AVXVNNIINT8__
const __m256i zero = _mm256_setzero_si256();
return _mm256_dpbssd_epi32(zero, x, y);
#else
// Get absolute values of x vectors
const __m256i ax = _mm256_sign_epi8(x, x);
// Sign the values of the y vectors
const __m256i sy = _mm256_sign_epi8(y, x);
return mul_sum_us8_pairs_int(ax, sy);
#endif
}
#endif
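As a plain scalar reference for what `mul_sum_i8_pairs_int` computes (my reading of the intrinsic sequence above, not code from the tree): each 32-bit lane ends up holding the dot product of four signed bytes.

```cpp
#include <cstdint>

// Hedged scalar model of one 32-bit lane of mul_sum_i8_pairs_int.
static int32_t mul_sum_i8_quad(const int8_t x[4], const int8_t y[4]) {
    int32_t acc = 0;
    for (int i = 0; i < 4; i++) {
        acc += (int32_t) x[i] * (int32_t) y[i]; // signed byte products, widened
    }
    return acc;
}
```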
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) { static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
block_q4_0x4 out; block_q4_0x4 out;
@ -255,6 +333,103 @@ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k)
y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3);
} }
} }
#elif defined(__AVX2__) || defined(__AVX__)
float id[4];
__m256 srcv[4][4];
__m256 idvec[4];
for (int i = 0; i < nb; i++) {
for (int row_iter = 0; row_iter < 4; row_iter++) {
// Load elements into 4 AVX vectors
__m256 v0 = _mm256_loadu_ps( x + row_iter * k + i * 32 );
__m256 v1 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 8 );
__m256 v2 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 16 );
__m256 v3 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 24 );
// Compute max(abs(e)) for the block
const __m256 signBit = _mm256_set1_ps( -0.0f );
__m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
const float maxScalar = _mm_cvtss_f32( max4 );
// Divided by 127.f to mirror results in quantize_row_q8_0
const float d = maxScalar / 127.f;
id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;
// Store the scale for the individual block
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
// Store the values in blocks of eight values - Aim is to use these later for block interleaving
srcv[row_iter][0] = v0;
srcv[row_iter][1] = v1;
srcv[row_iter][2] = v2;
srcv[row_iter][3] = v3;
idvec[row_iter] = _mm256_set1_ps(id[row_iter]);
}
// The loop iterates four times - The aim is to get 4 corresponding chunks of eight bytes from the original weight blocks that are interleaved
for (int j = 0; j < 4; j++) {
// Apply the multiplier
__m256 v0 = _mm256_mul_ps(srcv[0][j], idvec[0]);
__m256 v1 = _mm256_mul_ps(srcv[1][j], idvec[1]);
__m256 v2 = _mm256_mul_ps(srcv[2][j], idvec[2]);
__m256 v3 = _mm256_mul_ps(srcv[3][j], idvec[3]);
// Round to nearest integer
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
// Convert floats to integers
__m256i i0 = _mm256_cvtps_epi32( v0 );
__m256i i1 = _mm256_cvtps_epi32( v1 );
__m256i i2 = _mm256_cvtps_epi32( v2 );
__m256i i3 = _mm256_cvtps_epi32( v3 );
#if defined(__AVX2__)
// Convert int32 to int16
i0 = _mm256_packs_epi32( i0, i1 );
i2 = _mm256_packs_epi32( i2, i3 );
// Convert int16 to int8
i0 = _mm256_packs_epi16( i0, i2 );
// Permute and store the quantized weights in the required order after the pack instruction
const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
i0 = _mm256_permutevar8x32_epi32( i0, perm );
_mm256_storeu_si256((__m256i *)(y[i].qs + 32 * j), i0);
#else
// AVX lacks some of the integer instructions we need,
// so we split the registers in half and use the SSE analogs of the AVX2 calls
__m128i ni0 = _mm256_castsi256_si128( i0 );
__m128i ni1 = _mm256_extractf128_si256( i0, 1);
__m128i ni2 = _mm256_castsi256_si128( i1 );
__m128i ni3 = _mm256_extractf128_si256( i1, 1);
__m128i ni4 = _mm256_castsi256_si128( i2 );
__m128i ni5 = _mm256_extractf128_si256( i2, 1);
__m128i ni6 = _mm256_castsi256_si128( i3 );
__m128i ni7 = _mm256_extractf128_si256( i3, 1);
// Convert int32 to int16
ni0 = _mm_packs_epi32( ni0, ni1 );
ni2 = _mm_packs_epi32( ni2, ni3 );
ni4 = _mm_packs_epi32( ni4, ni5 );
ni6 = _mm_packs_epi32( ni6, ni7 );
// Convert int16 to int8
ni0 = _mm_packs_epi16( ni0, ni2 );
ni4 = _mm_packs_epi16( ni4, ni6 );
_mm_storeu_si128((__m128i *)(y[i].qs + 32 * j), ni0);
_mm_storeu_si128((__m128i *)(y[i].qs + 32 * j + 16), ni4);
#endif
}
}
#else #else
// scalar // scalar
const int blck_size_interleave = 8; const int blck_size_interleave = 8;
@ -337,34 +512,19 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
} }
size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) { UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4); return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
} }
else {
assert(false);
return 0;
}
}
size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) { UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8); return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
} }
else {
assert(false);
return 0;
}
}
size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) { UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8); return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
} }
else {
assert(false);
return 0;
}
}
void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
const int qk = QK8_0; const int qk = QK8_0;
@ -699,6 +859,96 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
"__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
"performance"); "performance");
#elif defined(__AVX2__)
// Lookup table to convert signed nibbles to signed bytes
__m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
__m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
// Permute mask used for easier vector processing at later stages
__m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
// Mask to mask out nibbles from packed bytes
const __m256i m4b = _mm256_set1_epi8(0x0F);
int64_t b_nb = n / QK4_0;
const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
const block_q8_0 * a_ptr_start = (const block_q8_0 *)vy;
// Process Q8_0 blocks one by one
for (int64_t y = 0; y < nr; y++) {
// Pointers to LHS blocks of block_q8_0 format
const block_q8_0 * a_ptr = a_ptr_start + (y * nb);
// Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
for (int64_t x = 0; x < nc / 8; x++) {
// Pointers to RHS blocks
const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
// Master FP accumulator
__m256 acc_row = _mm256_setzero_ps();
for (int64_t b = 0; b < nb; b++) {
// Load 8 blocks of Q4_0 interleaved as 8 bytes (B0 - B7)
const __m256i rhs_raw_vec_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
const __m256i rhs_raw_vec_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 1);
const __m256i rhs_raw_vec_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 2);
const __m256i rhs_raw_vec_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 3);
// 4-bit -> 8-bit - Sign is maintained
const __m256i rhs_vec_0123_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_0123_0, m4b)); // B0(0-7) B1(0-7) B2(0-7) B3(0-7)
const __m256i rhs_vec_4567_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_4567_0, m4b)); // B4(0-7) B5(0-7) B6(0-7) B7(0-7)
const __m256i rhs_vec_0123_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_0123_1, m4b)); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
const __m256i rhs_vec_4567_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_4567_1, m4b)); // B4(8-15) B5(8-15) B6(8-15) B7(8-15)
const __m256i rhs_vec_0123_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_0, 4), m4b)); // B0(16-23) B1(16-23) B2(16-23) B3(16-23)
const __m256i rhs_vec_4567_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_0, 4), m4b)); // B4(16-23) B5(16-23) B6(16-23) B7(16-23)
const __m256i rhs_vec_0123_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_1, 4), m4b)); // B0(24-31) B1(24-31) B2(24-31) B3(24-31)
const __m256i rhs_vec_4567_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_1, 4), m4b)); // B4(24-31) B5(24-31) B6(24-31) B7(24-31)
// Load the scale values for the 8 blocks interleaved in block_q4_0x8
const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
// Load and convert to FP32 scale from block_q8_0
const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d));
// Load the block values in block_q8_0 in batches of 16 bytes and replicate them across the 256-bit vector
__m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
__m256i lhs_vec_1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 16)));
lhs_vec_0 = _mm256_permute2f128_si256(lhs_vec_0, lhs_vec_0, 0); // A0(0-15) A0(0-15)
lhs_vec_1 = _mm256_permute2f128_si256(lhs_vec_1, lhs_vec_1, 0); // A0(16-31) A0(16-31)
__m256i iacc = _mm256_setzero_si256();
// Dot product done within 32 bit lanes and accumulated in the same vector
// B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
// B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
// ...........................................................................
// B0(28-31) B4(28-31) B1(28-31) B5(28-31) B2(28-31) B6(28-31) B3(28-31) B7(28-31) with A0(28-31)
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_0 ,_mm256_shuffle_epi32(rhs_vec_4567_0, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 0)));
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_0, 177) ,rhs_vec_4567_0, 170), _mm256_shuffle_epi32(lhs_vec_0, 85)));
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_1 ,_mm256_shuffle_epi32(rhs_vec_4567_1, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 170)));
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_1, 177) ,rhs_vec_4567_1, 170), _mm256_shuffle_epi32(lhs_vec_0, 255)));
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_2 ,_mm256_shuffle_epi32(rhs_vec_4567_2, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 0)));
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_2, 177) ,rhs_vec_4567_2, 170), _mm256_shuffle_epi32(lhs_vec_1, 85)));
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170)));
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255)));
// Accumulated values multiplied with appropriate scales
acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
}
// Accumulated output values permuted so as to be stored in appropriate order post accumulation
acc_row = _mm256_permutevar8x32_ps(acc_row, finalpermutemask);
_mm256_storeu_ps(s + (y * nr + x * 8), acc_row);
}
}
#else #else
float sumf[8]; float sumf[8];
int sumi; int sumi;
@ -2158,6 +2408,353 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) && GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
"__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal " "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
"performance"); "performance");
#elif defined(__AVX2__) || defined(__AVX512F__)
const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
int64_t b_nb = n / QK4_0;
int64_t y = 0;
// Mask to mask out nibbles from packed bytes
const __m256i m4b = _mm256_set1_epi8(0x0F);
const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3);
// Lookup table to convert signed nibbles to signed bytes
__m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
// Permute mask used for easier vector processing at later stages
__m256i requiredOrder = _mm256_set_epi32(3 ,2 ,1 ,0, 7 ,6, 5, 4);
// Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
int anr = nr - nr % 16; // Used to align nr with a boundary of 16
for (; y < anr / 4; y += 4) {
const block_q8_0x4 * a_ptrs[4];
a_ptrs[0] = a_ptr_start + (y * nb);
for (int i = 0; i < 3; ++i) {
a_ptrs[i + 1] = a_ptrs[i] + nb;
}
// Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
for (int64_t x = 0; x < nc / 8; x++) {
const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
// Master FP accumulators
__m256 acc_rows[16];
for (int i = 0; i < 16; i++) {
acc_rows[i] = _mm256_setzero_ps();
}
for (int64_t b = 0; b < nb; b++) {
// Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
// Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
// 4-bit -> 8-bit - Sign is maintained
const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
// Shuffle pattern one - right side input
const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
// Shuffle pattern two - right side input
const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
// Scale values - Load the weight scale values of block_q4_0x8
const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
// Process LHS in groups of four
for (int rp = 0; rp < 4; rp++) {
// Load the four block_q8_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
// Loaded as set of 128 bit vectors and repeated into a 256 bit vector
__m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
__m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
__m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
__m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
__m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
__m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
__m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
__m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
__m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
__m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
__m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
__m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
// Shuffle pattern one - left side input
const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
// Shuffle pattern two - left side input
const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
// Dot products are taken within each 32-bit lane of the shuffled values, i.e. corresponding bytes are multiplied and the products accumulated into a 32-bit integer per lane
// Resembles the MMLA accumulation into 2x2 matrices in the ARM version
__m256i iacc_mat_00_sp1 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
__m256i iacc_mat_01_sp1 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
__m256i iacc_mat_10_sp1 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
__m256i iacc_mat_11_sp1 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
__m256i iacc_mat_00_sp2 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
__m256i iacc_mat_01_sp2 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
__m256i iacc_mat_10_sp2 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
__m256i iacc_mat_11_sp2 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
// The outputs of both shuffle patterns are added to cover the dot products of all 32 values in the block
__m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
__m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
__m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
__m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
// Straighten out to make 4 row vectors
__m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
__m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
__m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
__m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
// Load the scale values (the d fields) of the four Q8_0 blocks and repeat them across lanes
const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
// Multiply with the appropriate scales and accumulate
acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
}
}
// Store the accumulated values
for (int i = 0; i < 16; i++) {
_mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
}
}
}
// Take one block_q8_0x4 structure per pass of the loop and perform the dot product operation
for (; y < nr / 4; y ++) {
const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
// Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
for (int64_t x = 0; x < nc / 8; x++) {
const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
// Master FP accumulators
__m256 acc_rows[4];
for (int i = 0; i < 4; i++) {
acc_rows[i] = _mm256_setzero_ps();
}
for (int64_t b = 0; b < nb; b++) {
// Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
// Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
// 4-bit -> 8-bit - Sign is maintained
const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
// Shuffle pattern one - right side input
const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
// Shuffle pattern two - right side input
const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
// Scale values - Load the weight scale values of block_q4_0x8
const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
// Load the four block_q8_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
// Loaded as a set of 128-bit vectors and repeated into a 256-bit vector
__m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
__m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
__m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
__m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
__m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
__m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
__m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
__m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
__m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
__m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
__m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
__m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
// Shuffle pattern one - left side input
const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
// Shuffle pattern two - left side input
const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
// Dot products are taken within each 32-bit lane of the shuffled values, i.e. corresponding bytes are multiplied and the products accumulated into a 32-bit integer per lane
// Resembles the MMLA accumulation into 2x2 matrices in the ARM version
__m256i iacc_mat_00_sp1 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
__m256i iacc_mat_01_sp1 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
__m256i iacc_mat_10_sp1 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
__m256i iacc_mat_11_sp1 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
__m256i iacc_mat_00_sp2 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
__m256i iacc_mat_01_sp2 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
__m256i iacc_mat_10_sp2 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
__m256i iacc_mat_11_sp2 =
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
// The outputs of both shuffle patterns are added to cover the dot products of all 32 values in the block
__m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
__m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
__m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
__m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
// Straighten out to make 4 row vectors
__m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
__m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
__m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
__m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
// Load the scale values (the d fields) of the four Q8_0 blocks and repeat them across lanes
const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
// Multiply with the appropriate scales and accumulate
acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
acc_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
}
// Store the accumulated values
for (int i = 0; i < 4; i++) {
_mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
}
}
}
#else #else
float sumf[4][8]; float sumf[4][8];
int sumi; int sumi;
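For reference, each accumulator element produced by the AVX2 tile above reduces to a plain 32-wide int8 dot product scaled by the two block scales, assuming mul_sum_i8_pairs_int(x, y) yields per 32-bit lane the sum of products of corresponding int8 bytes (as with vpmaddubsw + vpmaddwd, or vpdpbusds where available). A minimal scalar sketch, with illustrative names not taken from the source:

#include <stdint.h>

// One output element of the 4x8 tile: an int8 dot product over a 32-value block,
// scaled by the q4_0 and q8_0 block scales (the col_scale_f32/row_scale_f32 fmadd above).
static inline float dot_q4q8_block_ref(const int8_t * q4, // 32 sign-extended 4-bit weights
                                       const int8_t * q8, // 32 int8 activations
                                       float d_q4, float d_q8) {
    int32_t sumi = 0;
    for (int i = 0; i < 32; ++i) {
        sumi += (int32_t) q4[i] * (int32_t) q8[i];
    }
    return (float) sumi * d_q4 * d_q8;
}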

View file

@@ -723,6 +723,8 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
struct ggml_backend_cpu_context { struct ggml_backend_cpu_context {
int n_threads; int n_threads;
ggml_threadpool_t threadpool;
void * work_data; void * work_data;
size_t work_size; size_t work_size;
@@ -759,7 +761,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
cpu_plan->cgraph = *cgraph; // FIXME: deep copy cpu_plan->cgraph = *cgraph; // FIXME: deep copy
if (cpu_plan->cplan.work_size > 0) { if (cpu_plan->cplan.work_size > 0) {
@@ -796,7 +798,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_ctx->work_size < cplan.work_size) { if (cpu_ctx->work_size < cplan.work_size) {
free(cpu_ctx->work_data); free(cpu_ctx->work_data);
@@ -873,6 +875,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
} }
ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->threadpool = NULL;
ctx->work_data = NULL; ctx->work_data = NULL;
ctx->work_size = 0; ctx->work_size = 0;
ctx->abort_callback = NULL; ctx->abort_callback = NULL;
@@ -903,6 +906,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
ctx->n_threads = n_threads; ctx->n_threads = n_threads;
} }
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
if (ctx->threadpool && ctx->threadpool != threadpool) {
// already had a different threadpool, pause/suspend it before switching
ggml_threadpool_pause(ctx->threadpool);
}
ctx->threadpool = threadpool;
}
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
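A usage sketch for the new hook, assuming the ggml_threadpool creation API introduced alongside this change (ggml_threadpool_params_default, ggml_threadpool_new, ggml_threadpool_free; check ggml.h for the exact signatures):

#include "ggml.h"
#include "ggml-backend.h"

// Attach an explicit threadpool to the CPU backend; subsequent graph computes run
// on it, and setting a different pool later pauses the previous one before switching.
static void example_attach_threadpool(void) {
    ggml_backend_t backend = ggml_backend_cpu_init();

    struct ggml_threadpool_params params = ggml_threadpool_params_default(8); // 8 worker threads
    ggml_threadpool_t tp = ggml_threadpool_new(&params);

    ggml_backend_cpu_set_threadpool(backend, tp);

    // ... build graphs and call ggml_backend_graph_compute(backend, graph) ...

    ggml_backend_free(backend);
    ggml_threadpool_free(tp);
}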

View file

@@ -9,8 +9,10 @@
#include "ggml-cuda/binbcast.cuh" #include "ggml-cuda/binbcast.cuh"
#include "ggml-cuda/clamp.cuh" #include "ggml-cuda/clamp.cuh"
#include "ggml-cuda/concat.cuh" #include "ggml-cuda/concat.cuh"
#include "ggml-cuda/conv-transpose-1d.cuh"
#include "ggml-cuda/convert.cuh" #include "ggml-cuda/convert.cuh"
#include "ggml-cuda/cpy.cuh" #include "ggml-cuda/cpy.cuh"
#include "ggml-cuda/cross-entropy-loss.cuh"
#include "ggml-cuda/diagmask.cuh" #include "ggml-cuda/diagmask.cuh"
#include "ggml-cuda/dmmv.cuh" #include "ggml-cuda/dmmv.cuh"
#include "ggml-cuda/fattn.cuh" #include "ggml-cuda/fattn.cuh"
@@ -29,7 +31,6 @@
#include "ggml-cuda/tsembd.cuh" #include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh" #include "ggml-cuda/unary.cuh"
#include "ggml-cuda/upscale.cuh" #include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/conv-transpose-1d.cuh"
#include <algorithm> #include <algorithm>
#include <array> #include <array>
@@ -2181,6 +2182,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ADD: case GGML_OP_ADD:
ggml_cuda_op_add(ctx, dst); ggml_cuda_op_add(ctx, dst);
break; break;
case GGML_OP_SUB:
ggml_cuda_op_sub(ctx, dst);
break;
case GGML_OP_ACC: case GGML_OP_ACC:
ggml_cuda_op_acc(ctx, dst); ggml_cuda_op_acc(ctx, dst);
break; break;
@@ -2267,6 +2271,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_SQRT: case GGML_OP_SQRT:
ggml_cuda_op_sqrt(ctx, dst); ggml_cuda_op_sqrt(ctx, dst);
break; break;
case GGML_OP_SIN:
ggml_cuda_op_sin(ctx, dst);
break;
case GGML_OP_COS:
ggml_cuda_op_cos(ctx, dst);
break;
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
ggml_cuda_op_clamp(ctx, dst); ggml_cuda_op_clamp(ctx, dst);
break; break;
@@ -2303,6 +2313,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_FLASH_ATTN_EXT: case GGML_OP_FLASH_ATTN_EXT:
ggml_cuda_flash_attn_ext(ctx, dst); ggml_cuda_flash_attn_ext(ctx, dst);
break; break;
case GGML_OP_CROSS_ENTROPY_LOSS:
ggml_cuda_cross_entropy_loss(ctx, dst);
break;
default: default:
return false; return false;
} }
@@ -2610,6 +2623,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
for (int j = 0; j < GGML_MAX_SRC; j++) { for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) { if (node->src[j] != nullptr) {
assert(node->src[j]->buffer);
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
} }
} }
@@ -2853,12 +2867,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_TRANSPOSE: case GGML_OP_TRANSPOSE:
case GGML_OP_NORM: case GGML_OP_NORM:
case GGML_OP_ADD: case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_MUL: case GGML_OP_MUL:
case GGML_OP_DIV: case GGML_OP_DIV:
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_SQR: case GGML_OP_SQR:
case GGML_OP_SQRT: case GGML_OP_SQRT:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
case GGML_OP_CONT: case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
@@ -2890,6 +2907,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
} }
return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA && return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
case GGML_OP_CROSS_ENTROPY_LOSS:
return true;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
default: default:
return false; return false;

View file

@@ -9,6 +9,10 @@ static __device__ __forceinline__ float op_add(const float a, const float b) {
return a + b; return a + b;
} }
static __device__ __forceinline__ float op_sub(const float a, const float b) {
return a - b;
}
static __device__ __forceinline__ float op_mul(const float a, const float b) { static __device__ __forceinline__ float op_mul(const float a, const float b) {
return a * b; return a * b;
} }
@@ -271,6 +275,10 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
} }
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
}
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
} }

View file

@@ -2,5 +2,6 @@
void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@@ -0,0 +1,106 @@
#include "common.cuh"
#include "cross-entropy-loss.cuh"
#include "sumrows.cuh"
#include <cmath>
#include <cstdint>
static __global__ void cross_entropy_loss_f32(const float * logits, const float * labels, float * dst, const int nclasses, const int k) {
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
const int i0 = blockDim.x*blockIdx.x + warp_id*WARP_SIZE;
const int ne_tmp = WARP_SIZE*nclasses;
extern __shared__ float tmp_all[];
float * tmp_logits = tmp_all + (2*warp_id + 0)*ne_tmp;
float * tmp_labels = tmp_all + (2*warp_id + 1)*ne_tmp;
// Each warp first loads ne_tmp logits/labels into shared memory:
for (int i = lane_id; i < ne_tmp; i += WARP_SIZE) {
const int ig = i0*nclasses + i; // ig == i global
tmp_logits[i] = ig < k*nclasses ? logits[ig] : 0.0f;
tmp_labels[i] = ig < k*nclasses ? labels[ig] : 0.0f;
}
// Each thread in the warp then calculates the cross entropy loss for a single row.
// TODO: pad in order to avoid shared memory bank conflicts.
// Find maximum for softmax:
float max = -INFINITY;
for (int i = 0; i < nclasses; ++i) {
max = fmaxf(max, tmp_logits[lane_id*nclasses + i]);
}
// Calculate log(softmax(logits)): start from logits - max, the log of the sum is subtracted below:
float sum = 0.0f;
for (int i = 0; i < nclasses; ++i) {
float val = tmp_logits[lane_id*nclasses + i] - max;
sum += expf(val);
tmp_logits[lane_id*nclasses + i] = val;
}
sum = logf(sum);
// log(exp(logits - max) / sum) = (logits - max) - log(sum)
float loss = 0.0f;
for (int i = 0; i < nclasses; ++i) {
loss += (tmp_logits[lane_id*nclasses + i] - sum) * tmp_labels[lane_id*nclasses + i];
}
loss = -warp_reduce_sum(loss) / (float)k;
__syncthreads();
if (lane_id == 0) {
tmp_all[warp_id] = loss;
}
__syncthreads();
if (warp_id != 0) {
return;
}
loss = lane_id < CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE/WARP_SIZE ? tmp_all[lane_id] : 0.0f;
loss = warp_reduce_sum(loss);
if (lane_id != 0) {
return;
}
dst[blockIdx.x] = loss;
}
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(ggml_is_contiguous(dst));
const int64_t ne00 = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
ggml_cuda_pool & pool = ctx.pool();
cudaStream_t stream = ctx.stream();
const dim3 blocks_dim(CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
const dim3 blocks_num((nrows + CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE - 1) / CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
const int shmem = 2*CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE*ne00*sizeof(float);
ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
// Combine results from individual blocks:
sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
}
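Per row, the kernel evaluates the numerically stable cross entropy loss(row) = -sum_i labels[i] * (logits[i] - max - log(sum_j exp(logits[j] - max))), averages it over all k rows, and then sums the per-block partials with sum_rows_f32_cuda. A scalar reference for one row (illustrative, assuming the labels form a probability distribution):

#include <math.h>

static float cross_entropy_row_ref(const float * logits, const float * labels, int nclasses) {
    // Find the maximum for a numerically stable softmax:
    float max = -INFINITY;
    for (int i = 0; i < nclasses; ++i) {
        max = fmaxf(max, logits[i]);
    }
    // log(sum(exp(logits - max))):
    float sum = 0.0f;
    for (int i = 0; i < nclasses; ++i) {
        sum += expf(logits[i] - max);
    }
    const float log_sum = logf(sum);
    // loss = -sum_i labels[i] * log_softmax(logits)[i]
    float loss = 0.0f;
    for (int i = 0; i < nclasses; ++i) {
        loss += (logits[i] - max - log_sum) * labels[i];
    }
    return -loss;
}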

View file

@@ -0,0 +1,5 @@
#include "common.cuh"
#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@@ -22,6 +22,7 @@ typedef void (* fattn_kernel_t)(
const float m0, const float m0,
const float m1, const float m1,
const uint32_t n_head_log2, const uint32_t n_head_log2,
const float logit_softcap,
const int ne00, const int ne00,
const int ne01, const int ne01,
const int ne02, const int ne02,
@@ -659,9 +660,15 @@ void launch_fattn(
float scale = 1.0f; float scale = 1.0f;
float max_bias = 0.0f; float max_bias = 0.0f;
float logit_softcap = 0.0f;
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float)); memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float)); memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
memcpy(&logit_softcap, (float *) KQV->op_params + 2, sizeof(float));
if (logit_softcap != 0.0f) {
scale /= logit_softcap;
}
const uint32_t n_head = Q->ne[2]; const uint32_t n_head = Q->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
@@ -675,7 +682,7 @@ void launch_fattn(
V_data, V_data,
mask ? ((const char *) mask->data) : nullptr, mask ? ((const char *) mask->data) : nullptr,
(parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr, (parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
scale, max_bias, m0, m1, n_head_log2, scale, max_bias, m0, m1, n_head_log2, logit_softcap,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3], K->ne[0], K->ne[1], K->ne[2], K->ne[3],
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0, mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
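In scalar form, the softcap transform wired through here is s' = logit_softcap * tanh(scale * qk / logit_softcap): dividing scale by logit_softcap on the host lets each kernel keep its usual scaled accumulation and apply only a final tanhf, and logit_softcap == 0.0f selects the uncapped path. A minimal sketch with illustrative names:

#include <math.h>

// Squashes an attention logit smoothly into (-logit_softcap, +logit_softcap).
// scale_adj is the host-side scale already divided by logit_softcap.
static inline float softcapped_logit(float scale_adj, float logit_softcap, float qk) {
    const float s = scale_adj * qk;  // what the kernel accumulates
    return logit_softcap * tanhf(s); // applied before the mask/slope terms
}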

View file

@@ -4,7 +4,7 @@
#define FATTN_KQ_STRIDE_TILE_F16 64 #define FATTN_KQ_STRIDE_TILE_F16 64
template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1) __launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@@ -20,6 +20,7 @@ static __global__ void flash_attn_tile_ext_f16(
const float m0, const float m0,
const float m1, const float m1,
const uint32_t n_head_log2, const uint32_t n_head_log2,
const float logit_softcap,
const int ne00, const int ne00,
const int ne01, const int ne01,
const int ne02, const int ne02,
@@ -44,6 +45,12 @@ static __global__ void flash_attn_tile_ext_f16(
const int ne2, const int ne2,
const int ne3) { const int ne3) {
#ifdef FP16_AVAILABLE #ifdef FP16_AVAILABLE
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
return;
}
//In this kernel Q, K, V are matrices while i, j, k are matrix indices. //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on. const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
@@ -154,7 +161,13 @@ static __global__ void flash_attn_tile_ext_f16(
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y; const int j_KQ = j_KQ_0 + threadIdx.y;
half sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]); half sum;
if (use_logit_softcap) {
const float2 tmp = __half22float2(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
sum = logit_softcap * tanhf(tmp.x + tmp.y);
} else {
sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
}
sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f); sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum); kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
@@ -270,20 +283,20 @@ static __global__ void flash_attn_tile_ext_f16(
#endif // FP16_AVAILABLE #endif // FP16_AVAILABLE
} }
template <int cols_per_block, int parallel_blocks> template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0]; const ggml_tensor * Q = dst->src[0];
switch (Q->ne[0]) { switch (Q->ne[0]) {
case 64: { case 64: {
constexpr int D = 64; constexpr int D = 64;
constexpr int nwarps = 8; constexpr int nwarps = 8;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break; } break;
case 128: { case 128: {
constexpr int D = 128; constexpr int D = 128;
constexpr int nwarps = 8; constexpr int nwarps = 8;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break; } break;
default: { default: {
@@ -296,24 +309,45 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten
const ggml_tensor * KQV = dst; const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0]; const ggml_tensor * Q = dst->src[0];
const int32_t precision = KQV->op_params[2]; const int32_t precision = KQV->op_params[3];
GGML_ASSERT(precision == GGML_PREC_DEFAULT); GGML_ASSERT(precision == GGML_PREC_DEFAULT);
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
if (Q->ne[1] <= 16) { if (Q->ne[1] <= 16) {
constexpr int cols_per_block = 16; constexpr int cols_per_block = 16;
constexpr int parallel_blocks = 4; constexpr int parallel_blocks = 4;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
}
return; return;
} }
if (Q->ne[1] <= 32) { if (Q->ne[1] <= 32) {
constexpr int cols_per_block = 32; constexpr int cols_per_block = 32;
constexpr int parallel_blocks = 4; constexpr int parallel_blocks = 4;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
}
return; return;
} }
constexpr int cols_per_block = 32; constexpr int cols_per_block = 32;
constexpr int parallel_blocks = 1; constexpr int parallel_blocks = 1;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
}
} }
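The if/else blocks above, repeated below for the f32 tile and the vec kernels as well, all implement one idiom: fold a runtime float into a compile-time bool so that each kernel variant is compiled without the unused tanhf path. Reduced to its skeleton, with illustrative names:

template <bool use_logit_softcap>
static void launch_variant(/* ctx, dst, ... */) {
    // In the real launchers this instantiates flash_attn_*<..., use_logit_softcap>
    // and hands the kernel to launch_fattn.
}

static void dispatch_on_softcap(float logit_softcap) {
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
        launch_variant<use_logit_softcap>();
    } else {
        constexpr bool use_logit_softcap = true;
        launch_variant<use_logit_softcap>();
    }
}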

View file

@@ -4,7 +4,7 @@
#define FATTN_KQ_STRIDE_TILE_F32 32 #define FATTN_KQ_STRIDE_TILE_F32 32
template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1) __launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@@ -20,6 +20,7 @@ static __global__ void flash_attn_tile_ext_f32(
const float m0, const float m0,
const float m1, const float m1,
const uint32_t n_head_log2, const uint32_t n_head_log2,
const float logit_softcap,
const int ne00, const int ne00,
const int ne01, const int ne01,
const int ne02, const int ne02,
@@ -43,6 +44,12 @@ static __global__ void flash_attn_tile_ext_f32(
const int ne1, const int ne1,
const int ne2, const int ne2,
const int ne3) { const int ne3) {
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
return;
}
//In this kernel Q, K, V are matrices while i, j, k are matrix indices. //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on. const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
@@ -151,6 +158,10 @@ static __global__ void flash_attn_tile_ext_f32(
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y; const int j_KQ = j_KQ_0 + threadIdx.y;
if (use_logit_softcap) {
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
}
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f; sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]); kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
@@ -267,20 +278,20 @@ static __global__ void flash_attn_tile_ext_f32(
} }
} }
template <int cols_per_block, int parallel_blocks> template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0]; const ggml_tensor * Q = dst->src[0];
switch (Q->ne[0]) { switch (Q->ne[0]) {
case 64: { case 64: {
constexpr int D = 64; constexpr int D = 64;
constexpr int nwarps = 8; constexpr int nwarps = 8;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break; } break;
case 128: { case 128: {
constexpr int D = 128; constexpr int D = 128;
constexpr int nwarps = 8; constexpr int nwarps = 8;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break; } break;
default: { default: {
@@ -290,23 +301,45 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
} }
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0]; const ggml_tensor * Q = dst->src[0];
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
if (Q->ne[1] <= 16) { if (Q->ne[1] <= 16) {
constexpr int cols_per_block = 16; constexpr int cols_per_block = 16;
constexpr int parallel_blocks = 4; constexpr int parallel_blocks = 4;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
}
return; return;
} }
if (Q->ne[1] <= 32) { if (Q->ne[1] <= 32) {
constexpr int cols_per_block = 32; constexpr int cols_per_block = 32;
constexpr int parallel_blocks = 4; constexpr int parallel_blocks = 4;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
}
return; return;
} }
constexpr int cols_per_block = 32; constexpr int cols_per_block = 32;
constexpr int parallel_blocks = 1; constexpr int parallel_blocks = 1;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
}
} }

View file

@@ -1,7 +1,7 @@
#include "common.cuh" #include "common.cuh"
#include "fattn-common.cuh" #include "fattn-common.cuh"
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V> // D == head size template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1) __launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@@ -17,6 +17,7 @@ static __global__ void flash_attn_vec_ext_f16(
const float m0, const float m0,
const float m1, const float m1,
const uint32_t n_head_log2, const uint32_t n_head_log2,
const float logit_softcap,
const int ne00, const int ne00,
const int ne01, const int ne01,
const int ne02, const int ne02,
@@ -41,6 +42,12 @@ static __global__ void flash_attn_vec_ext_f16(
const int ne2, const int ne2,
const int ne3) { const int ne3) {
#ifdef FP16_AVAILABLE #ifdef FP16_AVAILABLE
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
return;
}
//In this kernel Q, K, V are matrices while i, j, k are matrix indices. //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16<D>(type_K); constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16<D>(type_K);
@@ -190,6 +197,11 @@ static __global__ void flash_attn_vec_ext_f16(
for (int j = 0; j < ncols; ++j) { for (int j = 0; j < ncols; ++j) {
half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]); half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
sum = warp_reduce_sum(sum); sum = warp_reduce_sum(sum);
if (use_logit_softcap) {
sum = logit_softcap*tanhf(sum);
}
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f); sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
if (ncols == 1) { if (ncols == 1) {
@@ -286,10 +298,10 @@ static __global__ void flash_attn_vec_ext_f16(
#endif // FP16_AVAILABLE #endif // FP16_AVAILABLE
} }
template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V> template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
constexpr int nwarps = D/WARP_SIZE; constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V>; fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
constexpr bool need_f16_K = D != 128; constexpr bool need_f16_K = D != 128;
constexpr bool need_f16_V = D != 128 && D != 64; constexpr bool need_f16_V = D != 128 && D != 64;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V); launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V);
@@ -297,48 +309,81 @@ void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx,
template <int D, ggml_type type_K, ggml_type type_V> template <int D, ggml_type type_K, ggml_type type_V>
void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_tensor * KQV = dst; const ggml_tensor * KQV = dst;
ggml_tensor * Q = dst->src[0]; const ggml_tensor * Q = dst->src[0];
ggml_tensor * K = dst->src[1]; const ggml_tensor * K = dst->src[1];
ggml_tensor * V = dst->src[2]; const ggml_tensor * V = dst->src[2];
const int32_t precision = KQV->op_params[2]; const int32_t precision = KQV->op_params[3];
GGML_ASSERT(precision == GGML_PREC_DEFAULT); GGML_ASSERT(precision == GGML_PREC_DEFAULT);
GGML_ASSERT(K->type == type_K); GGML_ASSERT(K->type == type_K);
GGML_ASSERT(V->type == type_V); GGML_ASSERT(V->type == type_V);
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
if (Q->ne[1] == 1) { if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1; constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4; constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return; return;
} }
if (Q->ne[1] == 2) { if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2; constexpr int cols_per_block = 2;
constexpr int parallel_blocks = 4; constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return; return;
} }
if (Q->ne[1] <= 4) { if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4; constexpr int cols_per_block = 4;
constexpr int parallel_blocks = 4; constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return; return;
} }
if (Q->ne[1] <= 8) { if (Q->ne[1] <= 8) {
constexpr int cols_per_block = 8; constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 4; constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return; return;
} }
constexpr int cols_per_block = 8; constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 1; constexpr int parallel_blocks = 1;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
} }
#define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V) \ #define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V) \

View file

@@ -1,7 +1,7 @@
#include "common.cuh" #include "common.cuh"
#include "fattn-common.cuh" #include "fattn-common.cuh"
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V> // D == head size template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1) __launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@@ -17,6 +17,7 @@ static __global__ void flash_attn_vec_ext_f32(
const float m0, const float m0,
const float m1, const float m1,
const uint32_t n_head_log2, const uint32_t n_head_log2,
const float logit_softcap,
const int ne00, const int ne00,
const int ne01, const int ne01,
const int ne02, const int ne02,
@@ -40,6 +41,12 @@ static __global__ void flash_attn_vec_ext_f32(
const int ne1, const int ne1,
const int ne2, const int ne2,
const int ne3) { const int ne3) {
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
return;
}
//In this kernel Q, K, V are matrices while i, j, k are matrix indices. //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
constexpr vec_dot_KQ_f32_t vec_dot_KQ = get_vec_dot_KQ_f32<D>(type_K); constexpr vec_dot_KQ_f32_t vec_dot_KQ = get_vec_dot_KQ_f32<D>(type_K);
@@ -180,6 +187,11 @@ static __global__ void flash_attn_vec_ext_f32(
for (int j = 0; j < ncols; ++j) { for (int j = 0; j < ncols; ++j) {
float sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_f2[j], Q_i32[j], Q_ds[j]); float sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
sum = warp_reduce_sum(sum); sum = warp_reduce_sum(sum);
if (use_logit_softcap) {
sum = logit_softcap*tanhf(sum);
}
sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f; sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum); kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);
@ -267,10 +279,10 @@ static __global__ void flash_attn_vec_ext_f32(
} }
} }
template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V> template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
constexpr int nwarps = D/WARP_SIZE; constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V>; fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
constexpr bool need_f16_K = D != 128; constexpr bool need_f16_K = D != 128;
constexpr bool need_f16_V = D != 128 && D != 64; constexpr bool need_f16_V = D != 128 && D != 64;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V); launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V);
@ -278,44 +290,78 @@ void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx,
template <int D, ggml_type type_K, ggml_type type_V> template <int D, ggml_type type_K, ggml_type type_V>
void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_tensor * Q = dst->src[0]; const ggml_tensor * KQV = dst;
ggml_tensor * K = dst->src[1]; const ggml_tensor * Q = dst->src[0];
ggml_tensor * V = dst->src[2]; const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
GGML_ASSERT(K->type == type_K); GGML_ASSERT(K->type == type_K);
GGML_ASSERT(V->type == type_V); GGML_ASSERT(V->type == type_V);
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
if (Q->ne[1] == 1) { if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1; constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4; constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return; return;
} }
if (Q->ne[1] == 2) { if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2; constexpr int cols_per_block = 2;
constexpr int parallel_blocks = 4; constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return; return;
} }
if (Q->ne[1] <= 4) { if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4; constexpr int cols_per_block = 4;
constexpr int parallel_blocks = 4; constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return; return;
} }
if (Q->ne[1] <= 8) { if (Q->ne[1] <= 8) {
constexpr int cols_per_block = 8; constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 4; constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return; return;
} }
constexpr int cols_per_block = 8; constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 1; constexpr int parallel_blocks = 1;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst); if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
} }
#define DECL_FATTN_VEC_F32_CASE(D, type_K, type_V) \ #define DECL_FATTN_VEC_F32_CASE(D, type_K, type_V) \


@ -6,7 +6,7 @@
#endif // FP16_MMA_AVAILABLE #endif // FP16_MMA_AVAILABLE
// D == head size, VKQ_stride == num VKQ rows calculated in parallel: // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t> template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1) __launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@ -22,6 +22,7 @@ static __global__ void flash_attn_ext_f16(
const float m0, const float m0,
const float m1, const float m1,
const uint32_t n_head_log2, const uint32_t n_head_log2,
const float logit_softcap,
const int ne00, const int ne00,
const int ne01, const int ne01,
const int ne02, const int ne02,
@ -46,6 +47,12 @@ static __global__ void flash_attn_ext_f16(
const int ne2, const int ne2,
const int ne3) { const int ne3) {
#ifdef FP16_MMA_AVAILABLE #ifdef FP16_MMA_AVAILABLE
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
return;
}
//In this kernel Q, K, V are matrices while i, j, k are matrix indices. //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on. const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
@ -85,6 +92,8 @@ static __global__ void flash_attn_ext_f16(
const half slopeh = __float2half(slopef); const half slopeh = __float2half(slopef);
const half2 slope2 = make_half2(slopef, slopef); const half2 slope2 = make_half2(slopef, slopef);
const half2 logit_softcap_2 = make_half2(logit_softcap, logit_softcap);
frag_b Q_b[D/16][ncols/frag_n]; frag_b Q_b[D/16][ncols/frag_n];
// A single buffer for temporarily holding tiles of KQ and VKQ parts: // A single buffer for temporarily holding tiles of KQ and VKQ parts:
@ -194,6 +203,10 @@ static __global__ void flash_attn_ext_f16(
const int k = k0 + threadIdx.x; const int k = k0 + threadIdx.x;
KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k]; KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k];
if (use_logit_softcap) {
KQ_f_tmp[k0/WARP_SIZE] = logit_softcap*tanhf(KQ_f_tmp[k0/WARP_SIZE]);
}
} }
float KQ_max_new = KQ_max_f[j0/nwarps]; float KQ_max_new = KQ_max_f[j0/nwarps];
@ -237,6 +250,15 @@ static __global__ void flash_attn_ext_f16(
const int k = k0 + threadIdx.x; const int k = k0 + threadIdx.x;
KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k]; KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k];
if (use_logit_softcap) {
// There is no dedicated hyperbolic tangent (tanh) function for half2.
KQ2_tmp[k0/WARP_SIZE] = h2exp(KQ2_tmp[k0/WARP_SIZE]*make_half2(2.0f, 2.0f));
KQ2_tmp[k0/WARP_SIZE] = (KQ2_tmp[k0/WARP_SIZE] - make_half2(1.0f, 1.0f))
/(KQ2_tmp[k0/WARP_SIZE] + make_half2(1.0f, 1.0f));
KQ2_tmp[k0/WARP_SIZE] *= logit_softcap_2;
}
} }
half2 KQ_max_new = KQ_max_h2[j0/nwarps]; half2 KQ_max_new = KQ_max_h2[j0/nwarps];
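The comment above is the key detail: CUDA provides h2exp() but no half2 tanh, so the kernel derives it from the identity tanh(x) = (exp(2x) - 1) / (exp(2x) + 1). The same math in scalar form, as a sketch:

    #include <cmath>

    // tanh built from exp, mirroring the half2 workaround above.
    static float tanh_via_exp(float x) {
        const float e2x = expf(2.0f * x);
        return (e2x - 1.0f) / (e2x + 1.0f);
    }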
@ -427,6 +449,7 @@ static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
template <int D, int cols_per_block, typename KQ_acc_t> template <int D, int cols_per_block, typename KQ_acc_t>
void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0]; const ggml_tensor * Q = dst->src[0];
constexpr int nwarps = 4; constexpr int nwarps = 4;
@ -435,20 +458,50 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3]; const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm; const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
if (4*blocks_num_pb1 < 2*nsm) { if (4*blocks_num_pb1 < 2*nsm) {
constexpr int parallel_blocks = 4; constexpr int parallel_blocks = 4;
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>; fattn_kernel_t fattn_kernel;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
} else {
constexpr bool use_logit_softcap = true;
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
}
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
return; return;
} }
if (2*blocks_num_pb1 < 2*nsm) { if (2*blocks_num_pb1 < 2*nsm) {
constexpr int parallel_blocks = 2; constexpr int parallel_blocks = 2;
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>; fattn_kernel_t fattn_kernel;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
} else {
constexpr bool use_logit_softcap = true;
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
}
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
return; return;
} }
constexpr int parallel_blocks = 1; constexpr int parallel_blocks = 1;
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>; fattn_kernel_t fattn_kernel;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
} else {
constexpr bool use_logit_softcap = true;
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
}
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} }


@ -13,7 +13,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
const ggml_tensor * KQV = dst; const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0]; const ggml_tensor * Q = dst->src[0];
const int32_t precision = KQV->op_params[2]; const int32_t precision = KQV->op_params[3];
if (precision != GGML_PREC_DEFAULT) { if (precision != GGML_PREC_DEFAULT) {
if (Q->ne[1] <= 32 || Q->ne[0] > 128) { if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
@ -301,7 +301,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
ggml_cuda_set_device(ctx.device); ggml_cuda_set_device(ctx.device);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const int32_t precision = KQV->op_params[2]; const int32_t precision = KQV->op_params[3];
// On AMD the tile kernels perform poorly, use the vec kernel instead: // On AMD the tile kernels perform poorly, use the vec kernel instead:
if (cc >= CC_OFFSET_AMD) { if (cc >= CC_OFFSET_AMD) {
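The index bump from op_params[2] to op_params[3] follows from logit_softcap being inserted into the op's parameter block. Judging by the reads in these hunks, scale sits at [0], max_bias at [1], logit_softcap at [2], and precision at [3]; the floats are stored bitwise in the int32_t array, hence the memcpy. A hypothetical helper showing that assumed layout:

    #include <cstdint>
    #include <cstring>

    // Assumed flash-attention op_params packing (hypothetical helper,
    // not part of ggml): floats type-punned into an int32_t array.
    struct fattn_params {
        float   scale;
        float   max_bias;
        float   logit_softcap;
        int32_t precision;
    };

    static fattn_params read_fattn_params(const int32_t op_params[4]) {
        fattn_params p;
        std::memcpy(&p.scale,         op_params + 0, sizeof(float));
        std::memcpy(&p.max_bias,      op_params + 1, sizeof(float));
        std::memcpy(&p.logit_softcap, op_params + 2, sizeof(float));
        p.precision = op_params[3];
        return p;
    }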


@ -16,7 +16,7 @@ static __global__ void k_sum_rows_f32(const float * x, float * dst, const int nc
} }
} }
static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, 1, 1); const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(nrows, 1, 1); const dim3 block_nums(nrows, 1, 1);
k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols); k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
@ -32,7 +32,6 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src0));
const int64_t ncols = src0->ne[0]; const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0); const int64_t nrows = ggml_nrows(src0);


@ -1,3 +1,5 @@
#include "common.cuh" #include "common.cuh"
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@ -101,6 +101,24 @@ static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
dst[i] = sqrtf(x[i]); dst[i] = sqrtf(x[i]);
} }
static __global__ void sin_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = sinf(x[i]);
}
static __global__ void cos_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = cosf(x[i]);
}
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k); gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@ -156,6 +174,16 @@ static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_
sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k); sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
} }
static void sin_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SIN_BLOCK_SIZE - 1) / CUDA_SIN_BLOCK_SIZE;
sin_f32<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data; const float * src0_d = (const float *)src0->data;
@ -312,3 +340,31 @@ void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
} }
void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
sin_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
cos_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
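Both launchers follow the usual elementwise recipe: one thread per element and a grid size rounded up with a ceiling division. The same computation as a tiny sketch:

    // ceil(k / block_size) without floating point; e.g. (1000 + 255)/256 = 4.
    static int num_blocks(int k, int block_size) {
        return (k + block_size - 1) / block_size;
    }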


@ -9,6 +9,8 @@
#define CUDA_HARDSWISH_BLOCK_SIZE 256 #define CUDA_HARDSWISH_BLOCK_SIZE 256
#define CUDA_SQR_BLOCK_SIZE 256 #define CUDA_SQR_BLOCK_SIZE 256
#define CUDA_SQRT_BLOCK_SIZE 256 #define CUDA_SQRT_BLOCK_SIZE 256
#define CUDA_SIN_BLOCK_SIZE 256
#define CUDA_COS_BLOCK_SIZE 256
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@ -31,3 +33,7 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@ -31,6 +31,8 @@ struct ggml_metal_kernel {
enum ggml_metal_kernel_type { enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_ADD, GGML_METAL_KERNEL_TYPE_ADD,
GGML_METAL_KERNEL_TYPE_ADD_ROW, GGML_METAL_KERNEL_TYPE_ADD_ROW,
GGML_METAL_KERNEL_TYPE_SUB,
GGML_METAL_KERNEL_TYPE_SUB_ROW,
GGML_METAL_KERNEL_TYPE_MUL, GGML_METAL_KERNEL_TYPE_MUL,
GGML_METAL_KERNEL_TYPE_MUL_ROW, GGML_METAL_KERNEL_TYPE_MUL_ROW,
GGML_METAL_KERNEL_TYPE_DIV, GGML_METAL_KERNEL_TYPE_DIV,
@ -82,6 +84,8 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_RMS_NORM, GGML_METAL_KERNEL_TYPE_RMS_NORM,
GGML_METAL_KERNEL_TYPE_GROUP_NORM, GGML_METAL_KERNEL_TYPE_GROUP_NORM,
GGML_METAL_KERNEL_TYPE_NORM, GGML_METAL_KERNEL_TYPE_NORM,
GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,
GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,
GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,
@ -205,6 +209,9 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
GGML_METAL_KERNEL_TYPE_CONCAT, GGML_METAL_KERNEL_TYPE_CONCAT,
GGML_METAL_KERNEL_TYPE_SQR, GGML_METAL_KERNEL_TYPE_SQR,
GGML_METAL_KERNEL_TYPE_SQRT,
GGML_METAL_KERNEL_TYPE_SIN,
GGML_METAL_KERNEL_TYPE_COS,
GGML_METAL_KERNEL_TYPE_SUM_ROWS, GGML_METAL_KERNEL_TYPE_SUM_ROWS,
GGML_METAL_KERNEL_TYPE_COUNT GGML_METAL_KERNEL_TYPE_COUNT
@ -491,6 +498,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB, sub, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW, sub_row, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
@ -542,6 +551,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, ssm_conv_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, ssm_scan_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, ctx->support_simdgroup_reduction);
@ -665,6 +676,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, cpy_f32_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, cpy_f32_iq4_nl, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT, concat, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT, concat, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR, sqr, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR, sqr, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
} }
@ -765,15 +779,20 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
case GGML_OP_PERMUTE: case GGML_OP_PERMUTE:
case GGML_OP_CONCAT: case GGML_OP_CONCAT:
case GGML_OP_ADD: case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_ACC: case GGML_OP_ACC:
case GGML_OP_MUL: case GGML_OP_MUL:
case GGML_OP_DIV: case GGML_OP_DIV:
case GGML_OP_REPEAT: case GGML_OP_REPEAT:
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
case GGML_OP_SQR:
case GGML_OP_SUM_ROWS:
return true; return true;
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_SIN:
case GGML_OP_COS:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_SUM_ROWS:
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
case GGML_OP_GROUP_NORM: case GGML_OP_GROUP_NORM:
@ -803,6 +822,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
return false; return false;
} }
return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
case GGML_OP_SSM_CONV:
case GGML_OP_SSM_SCAN:
return true;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
return ctx->support_simdgroup_reduction && return ctx->support_simdgroup_reduction &&
@ -1050,6 +1072,7 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_ADD: case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_MUL: case GGML_OP_MUL:
case GGML_OP_DIV: case GGML_OP_DIV:
{ {
@ -1073,6 +1096,7 @@ static enum ggml_status ggml_metal_graph_compute(
nb = ne00 / 4; nb = ne00 / 4;
switch (dst->op) { switch (dst->op) {
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
default: GGML_ABORT("fatal error"); default: GGML_ABORT("fatal error");
@ -1082,6 +1106,7 @@ static enum ggml_status ggml_metal_graph_compute(
} else { } else {
switch (dst->op) { switch (dst->op) {
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
default: GGML_ABORT("fatal error"); default: GGML_ABORT("fatal error");
@ -1409,6 +1434,48 @@ static enum ggml_status ggml_metal_graph_compute(
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SQRT:
{
GGML_ASSERT(ggml_is_contiguous(src0));
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SIN:
{
GGML_ASSERT(ggml_is_contiguous(src0));
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_COS:
{
GGML_ASSERT(ggml_is_contiguous(src0));
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_OP_SUM_ROWS: case GGML_OP_SUM_ROWS:
@ -1538,6 +1605,121 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} }
} break; } break;
case GGML_OP_SSM_CONV:
{
GGML_ASSERT(src0t == GGML_TYPE_F32);
GGML_ASSERT(src1t == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:11];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:12];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:15];
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:16];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:17];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:18];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SSM_SCAN:
{
struct ggml_tensor * src3 = gf->nodes[i]->src[3];
struct ggml_tensor * src4 = gf->nodes[i]->src[4];
struct ggml_tensor * src5 = gf->nodes[i]->src[5];
GGML_ASSERT(src3);
GGML_ASSERT(src4);
GGML_ASSERT(src5);
size_t offs_src3 = 0;
size_t offs_src4 = 0;
size_t offs_src5 = 0;
id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
id<MTLBuffer> id_src4 = src4 ? ggml_metal_get_buffer(src4, &offs_src4) : nil;
id<MTLBuffer> id_src5 = src5 ? ggml_metal_get_buffer(src5, &offs_src5) : nil;
const int64_t ne30 = src3->ne[0]; GGML_UNUSED(ne30);
const int64_t ne31 = src3->ne[1]; GGML_UNUSED(ne31);
const uint64_t nb30 = src3->nb[0];
const uint64_t nb31 = src3->nb[1];
const int64_t ne40 = src4->ne[0]; GGML_UNUSED(ne40);
const int64_t ne41 = src4->ne[1]; GGML_UNUSED(ne41);
const int64_t ne42 = src4->ne[2]; GGML_UNUSED(ne42);
const uint64_t nb40 = src4->nb[0];
const uint64_t nb41 = src4->nb[1];
const uint64_t nb42 = src4->nb[2];
const int64_t ne50 = src5->ne[0]; GGML_UNUSED(ne50);
const int64_t ne51 = src5->ne[1]; GGML_UNUSED(ne51);
const int64_t ne52 = src5->ne[2]; GGML_UNUSED(ne52);
const uint64_t nb50 = src5->nb[0];
const uint64_t nb51 = src5->nb[1];
const uint64_t nb52 = src5->nb[2];
const int64_t d_state = ne00;
const int64_t d_inner = ne01;
const int64_t n_seq_tokens = ne11;
const int64_t n_seqs = ne02;
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
[encoder setBuffer:id_src4 offset:offs_src4 atIndex:4];
[encoder setBuffer:id_src5 offset:offs_src5 atIndex:5];
[encoder setBuffer:id_dst offset:offs_dst atIndex:6];
[encoder setBytes:&d_state length:sizeof(d_state) atIndex:7];
[encoder setBytes:&d_inner length:sizeof(d_inner) atIndex:8];
[encoder setBytes:&n_seq_tokens length:sizeof(n_seq_tokens) atIndex:9];
[encoder setBytes:&n_seqs length:sizeof(n_seqs) atIndex:10];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:11];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:12];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:13];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
[encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
[encoder setBytes:&nb20 length:sizeof(nb20) atIndex:18];
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:19];
[encoder setBytes:&nb22 length:sizeof(nb22) atIndex:20];
[encoder setBytes:&nb30 length:sizeof(nb30) atIndex:21];
[encoder setBytes:&nb31 length:sizeof(nb31) atIndex:22];
[encoder setBytes:&nb40 length:sizeof(nb40) atIndex:23];
[encoder setBytes:&nb41 length:sizeof(nb41) atIndex:24];
[encoder setBytes:&nb42 length:sizeof(nb42) atIndex:25];
[encoder setBytes:&nb50 length:sizeof(nb50) atIndex:26];
[encoder setBytes:&nb51 length:sizeof(nb51) atIndex:27];
[encoder setBytes:&nb52 length:sizeof(nb52) atIndex:28];
[encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
{ {
GGML_ASSERT(ne00 == ne10); GGML_ASSERT(ne00 == ne10);
@ -2624,9 +2806,14 @@ static enum ggml_status ggml_metal_graph_compute(
float scale; float scale;
float max_bias; float max_bias;
float logit_softcap;
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale)); memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias)); memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
memcpy(&logit_softcap, ((int32_t *) dst->op_params) + 2, sizeof(logit_softcap));
if (logit_softcap != 0.0f) {
scale /= logit_softcap;
}
const uint32_t n_head = src0->ne[2]; const uint32_t n_head = src0->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
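Dividing scale by logit_softcap here is paired with the kernel multiplying the tanh output by logit_softcap further down, so the net result is the capped version of the originally scaled score. A scalar sketch of the algebra, under that assumption:

    #include <cmath>

    // s  = (q.k) * (scale / cap)   -- host pre-divides the scale
    // s' = cap * tanh(s)           -- kernel applies the cap
    // => s' = cap * tanh((q.k) * scale / cap)
    static float softcapped_score(float qk, float scale, float cap) {
        const float s = qk * (scale / cap);
        return cap * tanhf(s);
    }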
@ -2701,6 +2888,7 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBytes:&m0 length:sizeof(m0) atIndex:25]; [encoder setBytes:&m0 length:sizeof(m0) atIndex:25];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:26]; [encoder setBytes:&m1 length:sizeof(m1) atIndex:26];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27]; [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];
[encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28];
if (!use_vec_kernel) { if (!use_vec_kernel) {
// half8x8 kernel // half8x8 kernel


@ -17,7 +17,7 @@ enum ggml_sort_order {
GGML_SORT_ORDER_DESC, GGML_SORT_ORDER_DESC,
}; };
// general-purpose kernel for addition, multiplication and division of two tensors // general-purpose kernel for addition, subtraction, multiplication and division of two tensors
// pros: works for non-contiguous tensors, supports broadcast across all dims // pros: works for non-contiguous tensors, supports broadcast across all dims
// cons: not very efficient // cons: not very efficient
kernel void kernel_add( kernel void kernel_add(
@ -70,6 +70,56 @@ kernel void kernel_add(
} }
} }
kernel void kernel_sub(
device const char * src0,
device const char * src1,
device char * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant int64_t & offs,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig.z;
const int64_t i02 = tgpig.y;
const int64_t i01 = tgpig.x;
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs;
device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + offs;
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
const int i10 = i0 % ne10;
*((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) - *((device float *)(src1_ptr + i10*nb10));
}
}
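As with kernel_add, broadcasting works by wrapping every src1 index with a modulo (i13 = i03 % ne13 and so on), so a src1 dimension of extent 1 repeats across the whole src0 extent. A CPU sketch of the rule for the innermost dimension (hypothetical helper):

    // b wraps when its extent ne10 is smaller than a's extent ne0.
    static void sub_broadcast_row(const float * a, const float * b,
                                  float * dst, int ne0, int ne10) {
        for (int i0 = 0; i0 < ne0; ++i0) {
            dst[i0] = a[i0] - b[i0 % ne10];
        }
    }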
kernel void kernel_mul( kernel void kernel_mul(
device const char * src0, device const char * src0,
device const char * src1, device const char * src1,
@ -226,6 +276,15 @@ kernel void kernel_add_row(
dst[tpig] = src0[tpig] + src1[tpig % nb]; dst[tpig] = src0[tpig] + src1[tpig % nb];
} }
kernel void kernel_sub_row(
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
constant uint64_t & nb [[buffer(28)]],
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] - src1[tpig % nb];
}
kernel void kernel_mul_row( kernel void kernel_mul_row(
device const float4 * src0, device const float4 * src0,
device const float4 * src1, device const float4 * src1,
@ -358,6 +417,27 @@ kernel void kernel_sqr(
dst[tpig] = src0[tpig] * src0[tpig]; dst[tpig] = src0[tpig] * src0[tpig];
} }
kernel void kernel_sqrt(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = sqrt(src0[tpig]);
}
kernel void kernel_sin(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = sin(src0[tpig]);
}
kernel void kernel_cos(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = cos(src0[tpig]);
}
kernel void kernel_sum_rows( kernel void kernel_sum_rows(
device const float * src0, device const float * src0,
device float * dst, device float * dst,
@ -667,6 +747,127 @@ kernel void kernel_diag_mask_inf_8(
} }
} }
// ref: ggml.c:ggml_compute_forward_ssm_conv_f32
// TODO: optimize
kernel void kernel_ssm_conv_f32(
device const void * src0,
device const void * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t ir = tgpig.x;
const int64_t i2 = tgpig.y;
const int64_t i3 = tgpig.z;
const int64_t nc = ne10;
const int64_t ncs = ne00;
const int64_t nr = ne01;
const int64_t n_t = ne1;
const int64_t n_s = ne2;
device const float * s = (device const float *) ((device const char *) src0 + ir*nb01 + i2*nb00 + i3*nb02);
device const float * c = (device const float *) ((device const char *) src1 + ir*nb11);
device float * x = (device float *) ((device char *) dst + ir*nb0 + i2*nb1 + i3*nb2);
float sumf = 0.0f;
for (int64_t i0 = 0; i0 < nc; ++i0) {
sumf += s[i0] * c[i0];
}
x[0] = sumf;
}
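Each threadgroup produces one output element: a dot product between a length-nc window of the (already shifted) state row and that row's convolution weights. The inner loop in scalar form:

    // Scalar sketch of the per-element work in kernel_ssm_conv_f32.
    static float ssm_conv_point(const float * s, const float * c, int nc) {
        float sumf = 0.0f;
        for (int i0 = 0; i0 < nc; ++i0) {
            sumf += s[i0] * c[i0];
        }
        return sumf;
    }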
// ref: ggml.c:ggml_compute_forward_ssm_scan_f32
// TODO: optimize
kernel void kernel_ssm_scan_f32(
device const void * src0,
device const void * src1,
device const void * src2,
device const void * src3,
device const void * src4,
device const void * src5,
device float * dst,
constant int64_t & d_state,
constant int64_t & d_inner,
constant int64_t & n_seq_tokens,
constant int64_t & n_seqs,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant uint64_t & nb20,
constant uint64_t & nb21,
constant uint64_t & nb22,
constant uint64_t & nb30,
constant uint64_t & nb31,
constant uint64_t & nb40,
constant uint64_t & nb41,
constant uint64_t & nb42,
constant uint64_t & nb50,
constant uint64_t & nb51,
constant uint64_t & nb52,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t ir = tgpig.x;
const int64_t i3 = tgpig.y;
const int64_t nc = d_state;
const int64_t nr = d_inner;
const int64_t n_t = n_seq_tokens;
const int64_t n_s = n_seqs;
for (int64_t i2 = 0; i2 < n_t; ++i2) {
device const float * s0 = (device const float *) ((device const char *) src0 + ir*nb01 + i3*nb02);
device const float * x = (device const float *) ((device const char *) src1 + ir*nb10 + i2*nb11 + i3*nb12);
device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*nb21 + i3*nb22);
device const float * A = (device const float *) ((device const char *) src3 + ir*nb31);
device const float * B = (device const float *) ((device const char *) src4 + i2*nb41 + i3*nb42);
device const float * C = (device const float *) ((device const char *) src5 + i2*nb51 + i3*nb52);
device float * y = (device float *) ((device char *) dst + ir*nb10 + i2*nb11 + i3*nb12); // TODO: do not use src1 strides
device float * s = (device float *) ((device char *) dst + ir*nb01 + i3*nb02 + nb13);
if (i2 > 0) {
s0 = s;
}
// i1 == 0
float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
float x_dt = x[0] * dt_soft_plus;
float sumf = 0.0f;
for (int64_t i0 = 0; i0 < nc; ++i0) {
int64_t i = i0;
float state = (s0[i] * exp(dt_soft_plus * A[i])) + (B[i0] * x_dt);
sumf += state * C[i0];
s[i] = state;
}
y[0] = sumf;
}
}
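The scan processes one sequence's tokens in order; for each (row, token) pair it applies the softplus-discretized recurrence over the state dimension and stores the updated state so the next token picks it up via s0 = s. One step as a CPU sketch, assumed equivalent to the loop body above:

    #include <cmath>

    // state' = s0 * exp(dt' * A) + B * (x * dt'),  y = sum(state' * C),
    // with dt' = softplus(dt), passed through unchanged for large dt.
    static float ssm_scan_step(float * s, const float * s0, const float * A,
                               const float * B, const float * C,
                               float x, float dt, int d_state) {
        const float dt_soft_plus = dt <= 20.0f ? log1pf(expf(dt)) : dt;
        const float x_dt = x * dt_soft_plus;
        float sumf = 0.0f;
        for (int i = 0; i < d_state; ++i) {
            const float state = s0[i] * expf(dt_soft_plus * A[i]) + B[i] * x_dt;
            sumf += state * C[i];
            s[i] = state; // the next token reads this as its s0
        }
        return sumf;
    }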
kernel void kernel_norm( kernel void kernel_norm(
device const void * src0, device const void * src0,
device float * dst, device float * dst,
@ -1976,6 +2177,7 @@ typedef void (flash_attn_ext_f16_t)(
constant float & m0, constant float & m0,
constant float & m1, constant float & m1,
constant uint32_t & n_head_log2, constant uint32_t & n_head_log2,
constant float & logit_softcap,
threadgroup half * shared, threadgroup half * shared,
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]], uint3 tpitg[[thread_position_in_threadgroup]],
@ -2014,6 +2216,7 @@ kernel void kernel_flash_attn_ext_f16(
constant float & m0, constant float & m0,
constant float & m1, constant float & m1,
constant uint32_t & n_head_log2, constant uint32_t & n_head_log2,
constant float & logit_softcap,
threadgroup half * shared [[threadgroup(0)]], threadgroup half * shared [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]], uint3 tpitg[[thread_position_in_threadgroup]],
@ -2138,19 +2341,6 @@ kernel void kernel_flash_attn_ext_f16(
} }
simdgroup_store(mqk, ss + 8*cc, TF, 0, false); simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
const short tx = tiisg%4;
const short ty = tiisg/4;
if (mask != q) {
// mqk = mqk*scale + mask*slope
ss[8*cc + ty*TF + 2*tx + 0] = scale*ss[8*cc + ty*TF + 2*tx + 0] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 0];
ss[8*cc + ty*TF + 2*tx + 1] = scale*ss[8*cc + ty*TF + 2*tx + 1] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 1];
} else {
// mqk = mqk*scale
ss[8*cc + ty*TF + 2*tx + 0] *= scale;
ss[8*cc + ty*TF + 2*tx + 1] *= scale;
}
} }
} }
@ -2162,10 +2352,19 @@ kernel void kernel_flash_attn_ext_f16(
float ms[Q]; float ms[Q];
for (short j = 0; j < Q; ++j) { for (short j = 0; j < Q; ++j) {
const short p = tiisg;
const float m = M[j]; const float m = M[j];
const float s = ss[j*TF + p];
// scale and apply the logit softcap / mask
float s = ss[j*TF + tiisg]*scale;
if (logit_softcap != 0.0f) {
s = logit_softcap*precise::tanh(s);
}
if (mask != q) {
// mqk = mqk + mask*slope
s += slope*mp[ic + j*nb31/sizeof(half) + tiisg];
}
smax = simd_max(max(smax, s)); smax = simd_max(max(smax, s));
M[j] = simd_max(max(M[j], s)); M[j] = simd_max(max(M[j], s));
@ -2176,7 +2375,7 @@ kernel void kernel_flash_attn_ext_f16(
S[j] = S[j]*ms[j] + simd_sum(vs); S[j] = S[j]*ms[j] + simd_sum(vs);
// the P matrix from the paper (Q rows, C columns) // the P matrix from the paper (Q rows, C columns)
ss[j*TF + p] = vs; ss[j*TF + tiisg] = vs;
} }
// create a QxQ diagonal matrix for rescaling the output // create a QxQ diagonal matrix for rescaling the output
@ -2345,6 +2544,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
constant float & m0, constant float & m0,
constant float & m1, constant float & m1,
constant uint32_t & n_head_log2, constant uint32_t & n_head_log2,
constant float & logit_softcap,
threadgroup half * shared [[threadgroup(0)]], threadgroup half * shared [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]], uint3 tpitg[[thread_position_in_threadgroup]],
@ -2479,7 +2679,13 @@ kernel void kernel_flash_attn_ext_vec_f16(
// mqk = mqk*scale + mask*slope // mqk = mqk*scale + mask*slope
if (tiisg == 0) { if (tiisg == 0) {
mqk = mqk*scale + ((mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f); mqk *= scale;
if (logit_softcap != 0.0f) {
mqk = logit_softcap*precise::tanh(mqk);
}
mqk += (mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f;
ss4[cc] = mqk; ss4[cc] = mqk;
} }


@ -3829,7 +3829,7 @@ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
quantize_row_q8_K_ref(x, y, k); quantize_row_q8_K_ref(x, y, k);
} }
//===================================== Dot ptoducts ================================= //===================================== Dot products =================================
// //
// Helper functions // Helper functions


@ -76,8 +76,8 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
} }
// sum up partial sums and write back result // sum up partial sums and write back result
#pragma unroll const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2;
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { for (int mask = mask_start; mask > 0; mask >>= 1) {
tmp += tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
} }
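The new mask_start shortens the XOR butterfly when ncols <= GGML_SYCL_DMMV_X, apparently because fewer lanes then hold distinct partial sums. A CUDA-flavoured sketch of the same reduction, where __shfl_xor_sync plays the role of dpct::permute_sub_group_by_xor:

    // Butterfly sum over a warp; start_mask is WARP_SIZE/2 for a full
    // reduction, or WARP_SIZE/4 for the shortened variant above.
    __device__ float warp_reduce_sum_from(float v, int start_mask) {
        for (int mask = start_mask; mask > 0; mask >>= 1) {
            v += __shfl_xor_sync(0xffffffff, v, mask);
        }
        return v;
    }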


@ -188,6 +188,8 @@ struct vk_device_struct {
vk_pipeline pipeline_upscale_f32; vk_pipeline pipeline_upscale_f32;
vk_pipeline pipeline_scale_f32; vk_pipeline pipeline_scale_f32;
vk_pipeline pipeline_sqr_f32; vk_pipeline pipeline_sqr_f32;
vk_pipeline pipeline_sin_f32;
vk_pipeline pipeline_cos_f32;
vk_pipeline pipeline_clamp_f32; vk_pipeline pipeline_clamp_f32;
vk_pipeline pipeline_pad_f32; vk_pipeline pipeline_pad_f32;
vk_pipeline pipeline_repeat_f32; vk_pipeline pipeline_repeat_f32;
@ -1702,6 +1704,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
@ -4023,6 +4027,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_sqr_f32; return ctx->device->pipeline_sqr_f32;
} }
return nullptr; return nullptr;
case GGML_OP_SIN:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_sin_f32;
}
return nullptr;
case GGML_OP_COS:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_cos_f32;
}
return nullptr;
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_clamp_f32; return ctx->device->pipeline_clamp_f32;
@ -4171,6 +4185,8 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_SQR: case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_REPEAT: case GGML_OP_REPEAT:
@ -4381,6 +4397,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
case GGML_OP_MUL: case GGML_OP_MUL:
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_SQR: case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_REPEAT: case GGML_OP_REPEAT:
@ -4598,6 +4616,32 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
}, dryrun); }, dryrun);
} }
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
}, dryrun);
}
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
}, dryrun);
}
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
float * op_params = (float *)dst->op_params; float * op_params = (float *)dst->op_params;
const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src0_type_size = ggml_type_size(src0->type);
@ -5658,6 +5702,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_SQR: case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_CPY: case GGML_OP_CPY:
@ -5735,6 +5781,14 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_SQR: case GGML_OP_SQR:
ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun); ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun);
break;
case GGML_OP_SIN:
ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
break;
case GGML_OP_COS:
ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);
break; break;
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun); ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun);
@ -5851,6 +5905,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_SQR: case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_CPY: case GGML_OP_CPY:
@ -6582,6 +6638,8 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_SQR: case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_CONT: case GGML_OP_CONT:
@ -7024,6 +7082,10 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]); tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
} else if (tensor->op == GGML_OP_SQR) { } else if (tensor->op == GGML_OP_SQR) {
tensor_clone = ggml_sqr(ggml_ctx, src0_clone); tensor_clone = ggml_sqr(ggml_ctx, src0_clone);
} else if (tensor->op == GGML_OP_SIN) {
tensor_clone = ggml_sin(ggml_ctx, src0_clone);
} else if (tensor->op == GGML_OP_COS) {
tensor_clone = ggml_cos(ggml_ctx, src0_clone);
} else if (tensor->op == GGML_OP_CLAMP) { } else if (tensor->op == GGML_OP_CLAMP) {
tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]); tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
} else if (tensor->op == GGML_OP_PAD) { } else if (tensor->op == GGML_OP_PAD) {

(File diff suppressed because it is too large.)


@ -606,17 +606,29 @@ class tinyBLAS_Q0_AVX {
case 0x44: case 0x44:
mc = 4; mc = 4;
nc = 4; nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
gemm4xN<4>(m0, m, n0, n);
#else
gemm<4, 4>(m0, m, n0, n); gemm<4, 4>(m0, m, n0, n);
#endif
break; break;
case 0x43: case 0x43:
mc = 4; mc = 4;
nc = 3; nc = 3;
#if defined(__AVX2__) && defined(__F16C__)
gemm4xN<3>(m0, m, n0, n);
#else
gemm<4, 3>(m0, m, n0, n); gemm<4, 3>(m0, m, n0, n);
#endif
break; break;
case 0x34: case 0x34:
mc = 3; mc = 3;
nc = 4; nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
gemmMx4<3>(m0, m, n0, n);
#else
gemm<3, 4>(m0, m, n0, n); gemm<3, 4>(m0, m, n0, n);
#endif
break; break;
case 0x33: case 0x33:
mc = 3; mc = 3;
@ -626,12 +638,20 @@ class tinyBLAS_Q0_AVX {
case 0x42: case 0x42:
mc = 4; mc = 4;
nc = 2; nc = 2;
#if defined(__AVX2__) && defined(__F16C__)
gemm4xN<2>(m0, m, n0, n);
#else
gemm<4, 2>(m0, m, n0, n); gemm<4, 2>(m0, m, n0, n);
#endif
break; break;
case 0x24: case 0x24:
mc = 2; mc = 2;
nc = 4; nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
gemmMx4<2>(m0, m, n0, n);
#else
gemm<2, 4>(m0, m, n0, n); gemm<2, 4>(m0, m, n0, n);
#endif
break; break;
#else #else
case 0x44: case 0x44:
@ -639,13 +659,21 @@ class tinyBLAS_Q0_AVX {
case 0x42: case 0x42:
mc = 4; mc = 4;
nc = 2; nc = 2;
#if defined(__AVX2__) && defined(__F16C__)
gemm4xN<2>(m0, m, n0, n);
#else
gemm<4, 2>(m0, m, n0, n); gemm<4, 2>(m0, m, n0, n);
#endif
break; break;
case 0x34: case 0x34:
case 0x24: case 0x24:
mc = 2; mc = 2;
nc = 4; nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
gemmMx4<2>(m0, m, n0, n);
#else
gemm<2, 4>(m0, m, n0, n); gemm<2, 4>(m0, m, n0, n);
#endif
break; break;
case 0x33: case 0x33:
#endif #endif
@ -662,7 +690,11 @@ class tinyBLAS_Q0_AVX {
case 0x41: case 0x41:
mc = 4; mc = 4;
nc = 1; nc = 1;
#if defined(__AVX2__) && defined(__F16C__)
gemm4xN<1>(m0, m, n0, n);
#else
gemm<4, 1>(m0, m, n0, n); gemm<4, 1>(m0, m, n0, n);
#endif
break; break;
case 0x22: case 0x22:
mc = 2; mc = 2;
@ -672,7 +704,11 @@ class tinyBLAS_Q0_AVX {
case 0x14: case 0x14:
mc = 1; mc = 1;
nc = 4; nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
gemmMx4<1>(m0, m, n0, n);
#else
gemm<1, 4>(m0, m, n0, n); gemm<1, 4>(m0, m, n0, n);
#endif
break; break;
case 0x31: case 0x31:
mc = 3; mc = 3;
@ -708,6 +744,119 @@ class tinyBLAS_Q0_AVX {
mnpack(m0, m, np, n); mnpack(m0, m, np, n);
} }
#if defined(__AVX2__) && defined(__F16C__)
// Templated functions for gemm of dimensions 4xN
template <int RN>
NOINLINE void gemm4xN(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / 4;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * 4;
int64_t jj = n0 + job % xtiles * RN;
__m256 Cv[RN][4] = {};
for (int64_t l = 0; l < k; ++l) {
uint64_t a_delta = ((uint64_t)A[lda * (ii + 3) + l].d << 48) | ((uint64_t)A[lda * (ii + 2) + l].d << 32) | ((uint64_t)A[lda * (ii + 1) + l].d << 16) | (A[lda * (ii + 0) + l].d);
// Convert delta values for four blocks to float values
__m128 da = _mm_cvtph_ps(_mm_set_epi64x(0, a_delta));
__m256i avec0 = load(A + lda * (ii + 0) + l);
__m256i avec1 = load(A + lda * (ii + 1) + l);
__m256i avec2 = load(A + lda * (ii + 2) + l);
__m256i avec3 = load(A + lda * (ii + 3) + l);
for (int64_t j = 0; j < RN; ++j) {
__m128 db = _mm_set1_ps(unhalf(B[ldb * (jj + j) + l].d));
// Compute the products of the delta values for the four blocks and replicate them across the 256-bit lane
__m256 dvec = _mm256_castps128_ps256(_mm_mul_ps(da, db));
dvec = _mm256_permute2f128_ps(dvec, dvec, 0);
// Compute the dot products and scale them by the matching delta products
Cv[j][0] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
updot(_mm256_sign_epi8(avec0, avec0),
_mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec0)),
Cv[j][0]);
Cv[j][1] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
updot(_mm256_sign_epi8(avec1, avec1),
_mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec1)),
Cv[j][1]);
Cv[j][2] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
updot(_mm256_sign_epi8(avec2, avec2),
_mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec2)),
Cv[j][2]);
Cv[j][3] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
updot(_mm256_sign_epi8(avec3, avec3),
_mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec3)),
Cv[j][3]);
}
}
for (int64_t j = 0; j < RN; ++j)
for (int64_t i = 0; i < 4; ++i)
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
}
}
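The shuffle immediates above are lane selectors: after _mm256_permute2f128_ps duplicates the low 128-bit half of dvec, _mm256_shuffle_ps(dvec, dvec, imm) with imm = 0 (0b00000000), 85 (0b01010101), 170 (0b10101010) and 255 (0b11111111) broadcasts da[0]*db, da[1]*db, da[2]*db and da[3]*db across all eight floats. Since updot(sign(a, a), sign(b, a)) reduces to the plain int8 dot product a.b, each Cv update is, in scalar terms, the following (an illustrative sketch with hypothetical names, not part of the patch):

#include <cstdint>

// Scalar model of one Cv[j][i] update in gemm4xN.
// d_a, d_b: the fp16 block scales already widened to float;
// a, b:     the 32 signed int8 values of the two quantized blocks.
static float update_cell(float c, float d_a, float d_b,
                         const int8_t * a, const int8_t * b) {
    int32_t dot = 0;
    for (int t = 0; t < 32; ++t) {
        dot += (int32_t) a[t] * (int32_t) b[t];
    }
    return c + d_a * d_b * (float) dot; // madd(broadcast(d_a*d_b), updot(...), c)
}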
// Templated functions for gemm of dimensions Mx4
template <int RM>
NOINLINE void gemmMx4(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / 4;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * 4;
__m256 Cv[4][RM] = {};
for (int64_t l = 0; l < k; ++l) {
uint64_t b_delta = ((uint64_t)B[ldb * (jj + 3) + l].d << 48) | ((uint64_t)B[ldb * (jj + 2) + l].d << 32) | ((uint64_t)B[ldb * (jj + 1) + l].d << 16) | (B[ldb * (jj + 0) + l].d);
// Convert delta values for four blocks to float values
__m128 db = _mm_cvtph_ps(_mm_set_epi64x(0, b_delta));
__m256i bvec0 = load(B + ldb * (jj + 0) + l);
__m256i bvec1 = load(B + ldb * (jj + 1) + l);
__m256i bvec2 = load(B + ldb * (jj + 2) + l);
__m256i bvec3 = load(B + ldb * (jj + 3) + l);
for (int64_t i = 0; i < RM; ++i) {
__m128 da = _mm_set1_ps(unhalf(A[lda * (ii + i) + l].d));
// Compute the products of the delta values for the four blocks and replicate them across the 256-bit lane
__m256 dvec = _mm256_castps128_ps256(_mm_mul_ps(da, db));
dvec = _mm256_permute2f128_ps(dvec, dvec, 0);
// Compute the dot products and scale them by the matching delta products
Cv[0][i] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
load(A + lda * (ii + i) + l)),
_mm256_sign_epi8(bvec0, load(A + lda * (ii + i) + l))),
Cv[0][i]);
Cv[1][i] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
load(A + lda * (ii + i) + l)),
_mm256_sign_epi8(bvec1, load(A + lda * (ii + i) + l))),
Cv[1][i]);
Cv[2][i] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
load(A + lda * (ii + i) + l)),
_mm256_sign_epi8(bvec2, load(A + lda * (ii + i) + l))),
Cv[2][i]);
Cv[3][i] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
load(A + lda * (ii + i) + l)),
_mm256_sign_epi8(bvec3, load(A + lda * (ii + i) + l))),
Cv[3][i]);
}
}
for (int64_t j = 0; j < 4; ++j)
for (int64_t i = 0; i < RM; ++i)
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
}
}
#endif
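All three kernel families (gemm4xN, gemmMx4 and the generic gemm below) share the same static work partitioning: the output is cut into ytiles * xtiles tiles and thread ith takes a contiguous run of ceil(tiles / nth) of them. For example, with 48 tiles and nth = 3, duty = 16 and thread 1 handles jobs 16..31. A standalone sketch of that scheduling arithmetic:

#include <algorithm>
#include <cstdint>
#include <utility>

// Returns the [start, end) job range owned by thread ith out of nth.
static std::pair<int64_t, int64_t> tile_range(int64_t tiles, int64_t ith, int64_t nth) {
    const int64_t duty  = (tiles + nth - 1) / nth;         // ceil(tiles / nth)
    const int64_t start = duty * ith;
    const int64_t end   = std::min(start + duty, tiles);   // last thread may get fewer
    return {start, end};
}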
template <int RM, int RN> template <int RM, int RN>
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / RM; int64_t ytiles = (m - m0) / RM;

View file

@ -0,0 +1,15 @@
#version 450
#include "types.comp"
#include "generic_unary_head.comp"
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
}

View file

@ -0,0 +1,15 @@
#version 450
#include "types.comp"
#include "generic_unary_head.comp"
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
}
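Both new shaders (cos.comp above, sin.comp here) reuse the generic unary pattern from square.comp and clamp.comp: one invocation per destination element, an early-out bounds check against p.ne, and the scalar op applied through the src0_idx/dst_idx helpers so non-contiguous tensors work too. For the contiguous case the CPU-side equivalent is roughly the following (a sketch, not the actual ggml kernel):

#include <cmath>
#include <cstdint>

// Rough scalar equivalent of sin.comp for a contiguous f32 tensor.
static void op_sin_f32(const float * src, float * dst, int64_t ne) {
    for (int64_t i = 0; i < ne; ++i) {
        dst[i] = sinf(src[i]); // cos.comp is identical with cosf
    }
}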

View file

@ -396,6 +396,14 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
})); }));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] { tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
})); }));

View file

@ -94,6 +94,9 @@ class Keys:
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
class Attention: class Attention:
HEAD_COUNT = "{arch}.attention.head_count" HEAD_COUNT = "{arch}.attention.head_count"
@ -132,6 +135,9 @@ class Keys:
TIME_STEP_RANK = "{arch}.ssm.time_step_rank" TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
class WKV:
HEAD_SIZE = "{arch}.wkv.head_size"
class Tokenizer: class Tokenizer:
MODEL = "tokenizer.ggml.model" MODEL = "tokenizer.ggml.model"
PRE = "tokenizer.ggml.pre" PRE = "tokenizer.ggml.pre"
@ -207,6 +213,7 @@ class MODEL_ARCH(IntEnum):
GEMMA = auto() GEMMA = auto()
GEMMA2 = auto() GEMMA2 = auto()
STARCODER2 = auto() STARCODER2 = auto()
RWKV6 = auto()
MAMBA = auto() MAMBA = auto()
XVERSE = auto() XVERSE = auto()
COMMAND_R = auto() COMMAND_R = auto()
@ -270,6 +277,29 @@ class MODEL_TENSOR(IntEnum):
SSM_A = auto() SSM_A = auto()
SSM_D = auto() SSM_D = auto()
SSM_OUT = auto() SSM_OUT = auto()
TIME_MIX_W1 = auto()
TIME_MIX_W2 = auto()
TIME_MIX_LERP_X = auto()
TIME_MIX_LERP_K = auto()
TIME_MIX_LERP_V = auto()
TIME_MIX_LERP_R = auto()
TIME_MIX_LERP_G = auto()
TIME_MIX_LERP_W = auto()
TIME_MIX_FIRST = auto()
TIME_MIX_DECAY = auto()
TIME_MIX_DECAY_W1 = auto()
TIME_MIX_DECAY_W2 = auto()
TIME_MIX_KEY = auto()
TIME_MIX_VALUE = auto()
TIME_MIX_RECEPTANCE = auto()
TIME_MIX_GATE = auto()
TIME_MIX_LN = auto()
TIME_MIX_OUTPUT = auto()
CHANNEL_MIX_LERP_K = auto()
CHANNEL_MIX_LERP_R = auto()
CHANNEL_MIX_KEY = auto()
CHANNEL_MIX_RECEPTANCE = auto()
CHANNEL_MIX_VALUE = auto()
ATTN_Q_A = auto() ATTN_Q_A = auto()
ATTN_Q_B = auto() ATTN_Q_B = auto()
ATTN_KV_A_MQA = auto() ATTN_KV_A_MQA = auto()
@ -337,6 +367,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA: "gemma",
MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.GEMMA2: "gemma2",
MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.STARCODER2: "starcoder2",
MODEL_ARCH.RWKV6: "rwkv6",
MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.COMMAND_R: "command-r",
@ -400,6 +431,29 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x",
MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k",
MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v",
MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r",
MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g",
MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w",
MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first",
MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay",
MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1",
MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2",
MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key",
MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value",
MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance",
MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate",
MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln",
MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output",
MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k",
MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r",
MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key",
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance",
MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value",
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a", MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
@ -856,6 +910,37 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.RWKV6: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_NORM_2,
MODEL_TENSOR.TIME_MIX_W1,
MODEL_TENSOR.TIME_MIX_W2,
MODEL_TENSOR.TIME_MIX_LERP_X,
MODEL_TENSOR.TIME_MIX_LERP_K,
MODEL_TENSOR.TIME_MIX_LERP_V,
MODEL_TENSOR.TIME_MIX_LERP_R,
MODEL_TENSOR.TIME_MIX_LERP_G,
MODEL_TENSOR.TIME_MIX_LERP_W,
MODEL_TENSOR.TIME_MIX_FIRST,
MODEL_TENSOR.TIME_MIX_DECAY,
MODEL_TENSOR.TIME_MIX_DECAY_W1,
MODEL_TENSOR.TIME_MIX_DECAY_W2,
MODEL_TENSOR.TIME_MIX_KEY,
MODEL_TENSOR.TIME_MIX_VALUE,
MODEL_TENSOR.TIME_MIX_RECEPTANCE,
MODEL_TENSOR.TIME_MIX_GATE,
MODEL_TENSOR.TIME_MIX_LN,
MODEL_TENSOR.TIME_MIX_OUTPUT,
MODEL_TENSOR.CHANNEL_MIX_LERP_K,
MODEL_TENSOR.CHANNEL_MIX_LERP_R,
MODEL_TENSOR.CHANNEL_MIX_KEY,
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
MODEL_TENSOR.CHANNEL_MIX_VALUE,
],
MODEL_ARCH.MAMBA: [ MODEL_ARCH.MAMBA: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,

View file

@ -670,6 +670,18 @@ class GGUFWriter:
def add_expert_weights_scale(self, value: float) -> None: def add_expert_weights_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value) self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
def add_rescale_every_n_layers(self, count: int) -> None:
self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
def add_time_mix_extra_dim(self, dim: int) -> None:
self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
def add_time_decay_extra_dim(self, dim: int) -> None:
self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
def add_wkv_head_size(self, size: int) -> None:
self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
def add_layer_norm_eps(self, value: float) -> None: def add_layer_norm_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

View file

@ -27,6 +27,7 @@ class TensorNameMap:
"embedding.word_embeddings", # chatglm "embedding.word_embeddings", # chatglm
"transformer.token_embeddings", # openelm "transformer.token_embeddings", # openelm
"shared", # t5 "shared", # t5
"rwkv.embeddings", # rwkv
), ),
# Token type embeddings # Token type embeddings
@ -40,6 +41,7 @@ class TensorNameMap:
"embeddings.LayerNorm", # bert "embeddings.LayerNorm", # bert
"emb_ln", # nomic-bert "emb_ln", # nomic-bert
"transformer.norm", # openelm "transformer.norm", # openelm
"rwkv.blocks.0.pre_ln", # rwkv
), ),
# Position embeddings # Position embeddings
@ -57,6 +59,7 @@ class TensorNameMap:
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2 "lm_head.linear", # phi2
"output_layer", # chatglm "output_layer", # chatglm
"head", # rwkv
), ),
# Output norm # Output norm
@ -76,6 +79,7 @@ class TensorNameMap:
"encoder.final_layernorm", # chatglm "encoder.final_layernorm", # chatglm
"transformer.norm", # openelm "transformer.norm", # openelm
"model.norm", # nemotron "model.norm", # nemotron
"rwkv.ln_out", # rwkv
), ),
# Rope frequencies # Rope frequencies
@ -108,12 +112,14 @@ class TensorNameMap:
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
"encoder.layers.{bid}.input_layernorm", # chatglm "encoder.layers.{bid}.input_layernorm", # chatglm
"transformer.layers.{bid}.attn_norm", # openelm "transformer.layers.{bid}.attn_norm", # openelm
"rwkv.blocks.{bid}.ln1", # rwkv
), ),
# Attention norm 2 # Attention norm 2
MODEL_TENSOR.ATTN_NORM_2: ( MODEL_TENSOR.ATTN_NORM_2: (
"transformer.h.{bid}.ln_attn", # falcon40b "transformer.h.{bid}.ln_attn", # falcon40b
"encoder.layer.{bid}.layer_norm_1", # jina-v2-code "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
"rwkv.blocks.{bid}.ln2", # rwkv
), ),
# Attention query-key-value # Attention query-key-value
@ -434,6 +440,98 @@ class TensorNameMap:
"backbone.layers.{bid}.mixer.out_proj", "backbone.layers.{bid}.mixer.out_proj",
), ),
MODEL_TENSOR.TIME_MIX_W1: (
"rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_W2: (
"rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_LERP_X: (
"rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_LERP_K: (
"rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_LERP_V: (
"rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_LERP_R: (
"rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_LERP_G: (
"rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_LERP_W: (
"rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_FIRST: (
"rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_DECAY: (
"rwkv.blocks.{bid}.attention.time_decay", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_DECAY_W1: (
"rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_DECAY_W2: (
"rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6
),
MODEL_TENSOR.TIME_MIX_KEY: (
"rwkv.blocks.{bid}.attention.key", # rwkv
),
MODEL_TENSOR.TIME_MIX_VALUE: (
"rwkv.blocks.{bid}.attention.value", # rwkv
),
MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
"rwkv.blocks.{bid}.attention.receptance", # rwkv
),
MODEL_TENSOR.TIME_MIX_GATE: (
"rwkv.blocks.{bid}.attention.gate", # rwkv
),
MODEL_TENSOR.TIME_MIX_LN: (
"rwkv.blocks.{bid}.attention.ln_x", # rwkv
),
MODEL_TENSOR.TIME_MIX_OUTPUT: (
"rwkv.blocks.{bid}.attention.output", # rwkv
),
MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
"rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6
),
MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
"rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6
),
MODEL_TENSOR.CHANNEL_MIX_KEY: (
"rwkv.blocks.{bid}.feed_forward.key", # rwkv
),
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
"rwkv.blocks.{bid}.feed_forward.receptance", # rwkv
),
MODEL_TENSOR.CHANNEL_MIX_VALUE: (
"rwkv.blocks.{bid}.feed_forward.value", # rwkv
),
MODEL_TENSOR.ATTN_Q_A: ( MODEL_TENSOR.ATTN_Q_A: (
"model.layers.{bid}.self_attn.q_a_proj", # deepseek2 "model.layers.{bid}.self_attn.q_a_proj", # deepseek2
), ),

View file

@ -23,6 +23,7 @@ python = ">=3.8"
numpy = ">=1.17" numpy = ">=1.17"
tqdm = ">=4.27" tqdm = ">=4.27"
pyyaml = ">=5.1" pyyaml = ">=5.1"
sentencepiece = ">=0.1.98,<=0.2.0"
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]
pytest = "^5.2" pytest = "^5.2"

View file

@ -120,7 +120,7 @@ You can use GBNF grammars:
- In [llama-server](../examples/server): - In [llama-server](../examples/server):
- For any completion endpoints, passed as the `json_schema` body field - For any completion endpoints, passed as the `json_schema` body field
- For the `/chat/completions` endpoint, passed inside the `result_format` body field (e.g. `{"type": "json_object", "schema": {"items": {}}}`) - For the `/chat/completions` endpoint, passed inside the `response_format` body field (e.g. `{"type": "json_object", "schema": {"items": {}}}`)
- In [llama-cli](../examples/main), passed as the `--json` / `-j` flag - In [llama-cli](../examples/main), passed as the `--json` / `-j` flag
- To convert to a grammar ahead of time: - To convert to a grammar ahead of time:
- in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py) - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)

View file

@ -66,6 +66,7 @@ extern "C" {
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
}; };
// pre-tokenization types // pre-tokenization types
@ -269,9 +270,9 @@ extern "C" {
enum llama_split_mode split_mode; // how to split the model across multiple GPUs enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// main_gpu interpretation depends on split_mode: // main_gpu interpretation depends on split_mode:
// LLAMA_SPLIT_NONE: the GPU that is used for the entire model // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
// LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
// LLAMA_SPLIT_LAYER: ignored // LLAMA_SPLIT_MODE_LAYER: ignored
int32_t main_gpu; int32_t main_gpu;
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
@ -306,8 +307,8 @@ extern "C" {
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
uint32_t n_ubatch; // physical maximum batch size uint32_t n_ubatch; // physical maximum batch size
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
uint32_t n_threads; // number of threads to use for generation int32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing int32_t n_threads_batch; // number of threads to use for batch processing
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
@ -430,6 +431,13 @@ extern "C" {
//optional: //optional:
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa); LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
// Optional: an auto threadpool gets created in ggml if not passed explicitly
LLAMA_API void llama_attach_threadpool(
struct llama_context * ctx,
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch);
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
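A plausible usage sketch, assuming an existing struct llama_context * ctx and the ggml threadpool API introduced alongside this change (ggml_threadpool_params_default, ggml_threadpool_new, ggml_threadpool_free; check ggml.h for the exact signatures):

// Create one threadpool and use it for both generation and batch processing.
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8); // 8 threads (example value)
ggml_threadpool_t tp = ggml_threadpool_new(&tpp);

llama_attach_threadpool(ctx, tp, /*threadpool_batch =*/ tp);
// ... llama_decode() calls in between run on tp ...
llama_detach_threadpool(ctx);

ggml_threadpool_free(tp);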
// Call once at the end of the program - currently only used for MPI // Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void); LLAMA_API void llama_backend_free(void);
@ -839,13 +847,13 @@ extern "C" {
// Set the number of threads used for decoding // Set the number of threads used for decoding
// n_threads is the number of threads used for generation (single token) // n_threads is the number of threads used for generation (single token)
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch); LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);
// Get the number of threads used for generation of a single token. // Get the number of threads used for generation of a single token.
LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx); LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);
// Get the number of threads used for prompt and batch processing (multiple token). // Get the number of threads used for prompt and batch processing (multiple token).
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx); LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the model is in embeddings mode or not // Set whether the model is in embeddings mode or not
// If true, embeddings will be returned but logits will not // If true, embeddings will be returned but logits will not

View file

@ -17,7 +17,7 @@ classifiers = [
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = ">=3.9" python = ">=3.9"
numpy = "^1.25.0" numpy = "^1.25.0"
sentencepiece = ">=0.1.98,<0.2.0" sentencepiece = ">=0.1.98,<=0.2.0"
transformers = ">=4.35.2,<5.0.0" transformers = ">=4.35.2,<5.0.0"
protobuf = ">=4.21.0,<5.0.0" protobuf = ">=4.21.0,<5.0.0"
gguf = { path = "./gguf-py" } gguf = { path = "./gguf-py" }

View file

@ -1 +1 @@
797faa25af14126eb30134d4033139ae3c5428ed 28b7633d733bbeef0026570fbc61c79c5e9aa5ae

View file

@ -31,11 +31,17 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
static void replace_all(std::string & s, const std::string & search, const std::string & replace) { static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) { if (search.empty()) {
return; // Avoid infinite loop if 'search' is an empty string return;
} }
std::string builder;
builder.reserve(s.length());
size_t pos = 0; size_t pos = 0;
while ((pos = s.find(search, pos)) != std::string::npos) { size_t last_pos = 0;
s.replace(pos, search.length(), replace); while ((pos = s.find(search, last_pos)) != std::string::npos) {
pos += replace.length(); builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
} }
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
} }
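The rewrite replaces the repeated in-place s.replace() calls, each of which shifts the whole tail of the string, with a single pass that appends unchanged runs and replacements into a pre-reserved builder, making the cost linear in the string length rather than quadratic in the number of matches. Behavior is unchanged, e.g.:

std::string s = "a\\nb\\nc";
replace_all(s, "\\n", "\n"); // s == "a\nb\nc"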

View file

@ -58,17 +58,17 @@ struct naive_trie {
auto res = children.find(c); auto res = children.find(c);
if (res != children.end()) { if (res != children.end()) {
return res->second.get_longest_prefix(key, len, offset + 1); return res->second.get_longest_prefix(key, len, offset + 1);
} else { }
return std::make_pair(key, offset); return std::make_pair(key, offset);
} }
} const struct naive_trie * traverse(const char c) const {
struct naive_trie * traverse(const char c) {
auto res = children.find(c); auto res = children.find(c);
if (res != children.end()) { if (res != children.end()) {
return &res->second; return &res->second;
} else {
return NULL;
} }
return NULL;
} }
std::map<char, struct naive_trie> children; std::map<char, struct naive_trie> children;
bool has_value; bool has_value;
@ -843,7 +843,7 @@ struct llm_tokenizer_ugm {
// traverse the token matcher trie to find a matching token // traverse the token matcher trie to find a matching token
bool single_codepoint_token_found = false; bool single_codepoint_token_found = false;
const struct best_tokenization & current_best = tokenization_results[input_offset]; const struct best_tokenization & current_best = tokenization_results[input_offset];
struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]); const struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);
while (prefix_offset <= input_len && node != NULL) { while (prefix_offset <= input_len && node != NULL) {
// check if we found valid token in prefix // check if we found valid token in prefix
@ -963,7 +963,7 @@ private:
/* /*
* This structure is a view wrapper for XOR-compressed double array (XCDA) * This structure is a view wrapper for XOR-compressed double array (XCDA)
* See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries. * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
* Eeach bit-packed entry contains: * Each bit-packed entry contains:
* - BASE array value in bits 10-30 * - BASE array value in bits 10-30
* - LCHECK array value in bits 0-7 * - LCHECK array value in bits 0-7
* - LEAF array value in bit 9 * - LEAF array value in bit 9
@ -1097,6 +1097,111 @@ private:
struct naive_trie token_matcher; struct naive_trie token_matcher;
}; };
//
// RWKV tokenizer
//
static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
std::vector<uint8_t> output;
output.reserve(escaped.size());
// Parser state
bool escaping = false;
uint8_t hex_remaining = 0;
uint8_t hex_acc = 0;
// Step through characters, performing parsing
for (const char & c : escaped) {
// If we're parsing a hex code, interpret the next character
if (hex_remaining != 0) {
uint8_t value = (c >= 'a') ? (c - 'a' + 10) : (c - '0');
hex_acc = (hex_acc << 4) + value;
hex_remaining -= 1;
if (hex_remaining == 0) {
output.push_back(hex_acc);
hex_acc = 0;
}
continue;
}
// If we got an escape character, interpret it
if (escaping) {
if (c == 't') {
output.push_back('\t');
} else if (c == 'n') {
output.push_back('\n');
} else if (c == 'r') {
output.push_back('\r');
} else if (c == 'x') {
hex_remaining = 2;
} else {
output.push_back(c);
}
escaping = false;
continue;
}
if (c == '\\') {
escaping = true;
continue;
}
output.push_back(c);
}
return output;
}
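RWKV vocab entries are stored as escaped printable strings, and this helper undoes the escaping; note that it assumes lowercase hex digits in \xNN sequences. For example:

// "\t", "\n", "\r", "\xNN" and "\\" are recognized; everything else passes through.
std::vector<uint8_t> bytes = llama_unescape_rwkv_token("ab\\x41\\n");
// bytes == { 'a', 'b', 0x41, '\n' }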
struct llm_tokenizer_rwkv {
llm_tokenizer_rwkv(const llama_vocab & vocab): vocab(vocab) {
// RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
// For now, we decode the vocab here into the lookup we'll use for tokenization.
// build trie
for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
const auto & token = vocab.id_to_token[id];
const auto data = llama_unescape_rwkv_token(token.text);
token_matcher.insert((const char *) data.data(), data.size(), id);
}
}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
uint32_t position = 0;
while (position < text.size()) {
const struct naive_trie * node = token_matcher.traverse(text[position]);
if (node == NULL) {
// no matching token found, add unknown token
output.push_back(vocab.special_unk_id);
position += 1;
continue;
}
// traverse the trie to find the longest matching token
uint32_t token_id = 0;
uint32_t token_length = 0;
while (node != NULL) {
if (node->has_value) {
token_id = node->value;
token_length = position + 1;
}
node = node->traverse(text[++position]);
}
// add the longest matching token
output.push_back(token_id);
position = token_length;
}
}
const llama_vocab & vocab;
struct naive_trie token_matcher;
};
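Tokenization is greedy longest-match over that byte trie: from each position the tokenizer walks the trie as far as the input allows, emits the id of the deepest node that carried a value, and falls back to special_unk_id when not even the first byte matches. A worked example with a toy vocab (hypothetical ids):

// vocab: "a" -> 1, "ab" -> 2, "abc" -> 3; input: "abd"
// step 1: path 'a' (id 1) -> 'b' (id 2) -> 'd' missing => emit 2, position = 2
// step 2: 'd' has no trie entry                        => emit special_unk_id, position = 3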
// //
// (de-) tokenize // (de-) tokenize
// //
@ -1401,6 +1506,23 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
output.push_back(vocab.special_eos_id); output.push_back(vocab.special_eos_id);
} }
} break; } break;
case LLAMA_VOCAB_TYPE_RWKV:
{
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
llm_tokenizer_rwkv tokenizer(vocab);
tokenizer.tokenize(raw_text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
}
}
} break;
case LLAMA_VOCAB_TYPE_NONE: case LLAMA_VOCAB_TYPE_NONE:
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
@ -1616,6 +1738,17 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
} }
break; break;
} }
case LLAMA_VOCAB_TYPE_RWKV: {
std::vector<uint8_t> result = llama_unescape_rwkv_token(token_text);
// If we don't have enough space, return an error
if (result.size() > (size_t)length) {
return -(int)result.size();
}
memcpy(buf, result.data(), result.size());
return (int)result.size();
}
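Returning the negative required size matches the convention of the other vocab types, so callers can use the usual grow-and-retry pattern (a sketch assuming the (vocab, token, buf, length, lstrip, special) parameter order used elsewhere in this file):

std::vector<char> buf(8);
int32_t n = llama_token_to_piece_impl(vocab, token, buf.data(), buf.size(), 0, false);
if (n < 0) {
    buf.resize(-n); // -n is the required size
    n = llama_token_to_piece_impl(vocab, token, buf.data(), buf.size(), 0, false);
}
// buf[0..n) now holds the token bytes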
default: default:
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }

File diff suppressed because it is too large

View file

@ -949,6 +949,58 @@ struct test_rms_norm : public test_case {
} }
}; };
// GGML_OP_SSM_CONV
struct test_ssm_conv : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne_a;
const std::array<int64_t, 4> ne_b;
std::string vars() override {
return VARS_TO_STR3(type, ne_a, ne_b);
}
test_ssm_conv(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
std::array<int64_t, 4> ne_b = {3, 3, 1, 1})
: type(type), ne_a(ne_a), ne_b(ne_b) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
ggml_tensor * out = ggml_ssm_conv(ctx, a, b);
return out;
}
};
// GGML_OP_SSM_SCAN
struct test_ssm_scan : public test_case {
const ggml_type type;
const int64_t d_state;
const int64_t d_inner;
const int64_t n_seq_tokens;
const int64_t n_seqs;
std::string vars() override {
return VARS_TO_STR5(type, d_state, d_inner, n_seq_tokens, n_seqs);
}
test_ssm_scan(ggml_type type = GGML_TYPE_F32,
int64_t d_state = 32, int64_t d_inner = 32, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
: type(type), d_state(d_state), d_inner(d_inner), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * s = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner, n_seqs, 1 }.data());
ggml_tensor * x = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
ggml_tensor * dt = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
ggml_tensor * A = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner, 1 , 1 }.data());
ggml_tensor * B = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
ggml_tensor * C = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
ggml_tensor * out = ggml_ssm_scan(ctx, s, x, dt, A, B, C);
return out;
}
};
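For reference, ggml_ssm_scan implements the Mamba-style selective scan over n_seq_tokens steps; per sequence, inner channel i and state dimension j the recurrence is roughly the following (dt is passed through softplus inside the kernel; see ggml.c for the exact semantics):

// s[i][j] <- exp(softplus(dt[i]) * A[i][j]) * s[i][j] + softplus(dt[i]) * B[j] * x[i]
// y[i]    <- sum over j of C[j] * s[i][j]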
// GGML_OP_MUL_MAT // GGML_OP_MUL_MAT
struct test_mul_mat : public test_case { struct test_mul_mat : public test_case {
const ggml_type type_a; const ggml_type type_a;
@ -1108,6 +1160,58 @@ struct test_sqrt : public test_case {
} }
}; };
// GGML_OP_SIN
struct test_sin : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
std::string vars() override {
return VARS_TO_STR2(type, ne);
}
test_sin(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 10, 10, 10})
: type(type), ne(ne) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * out = ggml_sin(ctx, a);
return out;
}
void initialize_tensors(ggml_context * ctx) override {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
init_tensor_uniform(t, -100.0f, 100.0f);
}
}
};
// GGML_OP_COS
struct test_cos : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
std::string vars() override {
return VARS_TO_STR2(type, ne);
}
test_cos(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 10, 10, 10})
: type(type), ne(ne) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * out = ggml_cos(ctx, a);
return out;
}
void initialize_tensors(ggml_context * ctx) override {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
init_tensor_uniform(t, -100.0f, 100.0f);
}
}
};
// GGML_OP_CLAMP // GGML_OP_CLAMP
struct test_clamp : public test_case { struct test_clamp : public test_case {
const ggml_type type; const ggml_type type;
@ -1652,19 +1756,20 @@ struct test_flash_attn_ext : public test_case {
const bool mask; // use mask const bool mask; // use mask
const float max_bias; // ALiBi const float max_bias; // ALiBi
const float logit_softcap; // Gemma 2
const ggml_type type_KV; const ggml_type type_KV;
std::string vars() override { std::string vars() override {
return VARS_TO_STR7(hs, nh, kv, nb, mask, max_bias, type_KV); return VARS_TO_STR8(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV);
} }
double max_nmse_err() override { double max_nmse_err() override {
return 5e-4; return 5e-4;
} }
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, ggml_type type_KV = GGML_TYPE_F16) test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), type_KV(type_KV) {} : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * build_graph(ggml_context * ctx) override {
const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV)); const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));
@ -1673,7 +1778,28 @@ struct test_flash_attn_ext : public test_case {
ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1); ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1); ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr; ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias); ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias, logit_softcap);
return out;
}
};
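logit_softcap exercises the Gemma 2-style score squashing: with a non-zero cap the KQ scores are passed through s' = softcap * tanh(s / softcap) before softmax (the kernel folds the 1/softcap factor into the scale). In scalar terms (an illustrative sketch):

#include <cmath>

// Softcapped attention score for a single q.k product.
static float softcapped_score(float qk, float scale, float softcap) {
    const float s = qk * scale;
    return softcap > 0.0f ? softcap * tanhf(s / softcap) : s;
}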
// GGML_OP_CROSS_ENTROPY_LOSS
struct test_cross_entropy_loss : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
std::string vars() override {
return VARS_TO_STR2(type, ne);
}
test_cross_entropy_loss(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 10, 10, 10})
: type(type), ne(ne) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * out = ggml_cross_entropy_loss(ctx, logits, labels);
return out; return out;
} }
}; };
@ -2239,6 +2365,12 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps)); test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
} }
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1}));
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, 1536, 1, 1}, {4, 1536, 1, 1}));
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 4, 1}, {4, 1536, 1, 1}));
test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4));
#if 1 #if 1
for (ggml_type type_a : base_types) { for (ggml_type type_a : base_types) {
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) { for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
@ -2334,6 +2466,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_sqr()); test_cases.emplace_back(new test_sqr());
test_cases.emplace_back(new test_sqrt()); test_cases.emplace_back(new test_sqrt());
test_cases.emplace_back(new test_sin());
test_cases.emplace_back(new test_cos());
test_cases.emplace_back(new test_clamp()); test_cases.emplace_back(new test_clamp());
test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5)); test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
@ -2437,11 +2571,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
for (bool mask : { true, false } ) { for (bool mask : { true, false } ) {
for (float max_bias : { 0.0f, 8.0f }) { for (float max_bias : { 0.0f, 8.0f }) {
if (!mask && max_bias > 0.0f) continue; if (!mask && max_bias > 0.0f) continue;
for (float logit_softcap : {0.0f, 10.0f}) {
if (hs != 128 && logit_softcap != 0.0f) continue;
for (int nh : { 32, }) { for (int nh : { 32, }) {
for (int kv : { 512, 1024, }) { for (int kv : { 512, 1024, }) {
for (int nb : { 1, 2, 4, 8, }) { for (int nb : { 1, 2, 4, 8, }) {
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) { for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, type_KV)); test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
} }
} }
} }
@ -2449,6 +2585,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
} }
} }
} }
}
test_cases.emplace_back(new test_cross_entropy_loss());
// these tests are disabled to save execution time, but they can be handy for debugging // these tests are disabled to save execution time, but they can be handy for debugging
#if 0 #if 0
@ -2483,7 +2622,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
} }
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
return false;
} }
static void usage(char ** argv) { static void usage(char ** argv) {

View file

@ -1,10 +1,14 @@
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
#include "ggml.h" #include "ggml.h"
#include <cfloat>
#include <cmath> #include <cmath>
#include <cstdint>
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <cassert> #include <cassert>
#include <initializer_list>
#include <vector>
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
@ -217,7 +221,8 @@ static bool check_gradient(
int nargs, int nargs,
float eps, float eps,
float max_error_abs, float max_error_abs,
float max_error_rel) { float max_error_rel,
std::vector<double> expected_vals) {
static int n_threads = -1; static int n_threads = -1;
if (n_threads < 0) { if (n_threads < 0) {
@ -248,9 +253,10 @@ static bool check_gradient(
// ggml_graph_dump_dot(gb, gf, "test-grad0-backward.dot"); // ggml_graph_dump_dot(gb, gf, "test-grad0-backward.dot");
for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
bool all_g0_bad = true;
const int nelements = ggml_nelements(x[i]); const int nelements = ggml_nelements(x[i]);
for (int k = 0; k < nelements; ++k) { for (int k = 0; k < nelements; ++k) {
// compute gradient using finite differences // Calculate gradient numerically:
const float x0 = ggml_get_f32_1d(x[i], k); const float x0 = ggml_get_f32_1d(x[i], k);
const float xm = x0 - eps; const float xm = x0 - eps;
const float xp = x0 + eps; const float xp = x0 + eps;
@ -267,6 +273,28 @@ static bool check_gradient(
const double f1 = ggml_get_f32_1d(f, 0); const double f1 = ggml_get_f32_1d(f, 0);
const double g0 = (f0 - f1)/(2.0*(double) eps); const double g0 = (f0 - f1)/(2.0*(double) eps);
// The numerical calculation of the gradient fails around discontinuities (e.g. at 0 for ReLU).
// In such cases, provide a vector of expected values and skip the comparison for failed calculations.
if (!expected_vals.empty()) {
bool matches_any = false;
for (const double & ev : expected_vals) {
const double error_abs = std::fabs(g0 - ev);
if (error_abs > max_error_abs) {
continue;
}
const double error_rel = g0 != 0.0 ? fabs(g0 - ev)/fabs(g0) : 0.0;
if (error_rel > max_error_rel) {
continue;
}
matches_any = true;
break;
}
if (!matches_any) {
continue;
}
}
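// Worked example: for ReLU at x0 = 0 with eps = 1e-3 the central difference gives
// g0 = (f(+eps) - f(-eps)) / (2*eps) = (1e-3 - 0) / 2e-3 = 0.5, which matches
// neither expected value in {0.0, 1.0}, so this k is skipped.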
all_g0_bad = false;
ggml_set_f32_1d(x[i], k, x0); ggml_set_f32_1d(x[i], k, x0);
// compute gradient using backward graph // compute gradient using backward graph
@ -278,7 +306,7 @@ static bool check_gradient(
const double g1 = ggml_get_f32_1d(x[i]->grad, k); const double g1 = ggml_get_f32_1d(x[i]->grad, k);
const double error_abs = fabs(g0 - g1); const double error_abs = fabs(g0 - g1);
const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0; const double error_rel = g0 != 0.0 ? fabs(g0 - g1)/fabs(g0) : 0.0;
if (error_abs > max_error_abs || error_rel > max_error_rel) { if (error_abs > max_error_abs || error_rel > max_error_rel) {
printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n", printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
@ -287,6 +315,10 @@ static bool check_gradient(
return false; return false;
} }
} }
if (all_g0_bad) {
printf("%s: numerical calculation of the gradient failed for all values\n", op_name);
return false;
}
} }
return true; return true;
@ -404,7 +436,7 @@ int main(int argc, const char ** argv) {
seed_iter = rand(); seed_iter = rand();
unsigned seed = rand(); unsigned seed = rand();
printf("test-grad0: iter:%d/%d\n", iter, niter); printf("test-grad0: iter:%d/%d\n", (iter+1), niter);
struct ggml_context * ctx0 = ggml_init(params); struct ggml_context * ctx0 = ggml_init(params);
get_random_dims(ne, 4); get_random_dims(ne, 4);
@ -424,7 +456,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f); check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f, {});
} }
} }
@ -441,7 +473,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f); check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f, {});
} }
} }
@ -458,7 +490,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
} }
} }
@ -475,7 +507,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -492,7 +524,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f); check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f, {});
} }
} }
@ -509,7 +541,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -526,7 +558,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f); check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f, {});
} }
} }
@ -543,7 +575,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f); check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f, {});
} }
} }
@ -560,7 +592,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, x[0]); struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
} }
} }
@ -578,7 +610,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0]))); struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
} }
} }
@ -596,7 +628,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
} }
} }
@ -614,7 +646,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
} }
} }
@ -637,7 +669,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1])))); struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
} }
} }
@ -660,25 +692,25 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0])))); struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY, {});
} }
} }
// abs (finite differences do not work) // abs
//{ {
// const int nargs = 1; const int nargs = 1;
// for (int ndims = 1; ndims <= 2; ++ndims) { for (int ndims = 1; ndims <= 4; ++ndims) {
// for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
// x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
// ggml_set_param(ctx0, x[i]); ggml_set_param(ctx0, x[i]);
// } }
// struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
// check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f); check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f, {-1.0, 1.0});
// } }
//} }
// sgn // sgn
{ {
@ -693,7 +725,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0])); struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {0.0});
} }
} }
@ -710,7 +742,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0])); struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
} }
} }
@ -727,7 +759,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0])); struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {0.0});
} }
} }
@ -745,7 +777,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0])); struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
} }
} }
@ -776,7 +808,7 @@ int main(int argc, const char ** argv) {
GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims); GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
if (ndims == 2) { if (ndims == 2) {
// check_mat_mul does not support ndims > 2 // check_mat_mul does not support ndims > 2
check_mat_mul(m, x[1], x[0]); check_mat_mul(m, x[1], x[0]);
@ -800,7 +832,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0])); struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
} }
} }
@ -817,7 +849,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0])); struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {0.0, 1.0});
} }
} }
@ -835,7 +867,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0])); struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f, {});
} }
} }
@ -854,9 +886,9 @@ int main(int argc, const char ** argv) {
#ifdef GGML_SILU_FP16 #ifdef GGML_SILU_FP16
// due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds. // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY); check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY, {});
#else #else
check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
#endif #endif
} }
} }
@ -874,7 +906,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY); check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY, {});
} }
} }
@ -892,7 +924,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s));
check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -910,7 +942,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -928,7 +960,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY); check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY, {});
} }
} }
@ -952,7 +984,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -976,7 +1008,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -1004,7 +1036,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -1037,7 +1069,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -1072,7 +1104,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -1109,7 +1141,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -1137,7 +1169,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));
check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -1170,7 +1202,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));
check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -1194,7 +1226,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));
check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -1225,7 +1257,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));
check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -1257,7 +1289,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -1291,7 +1323,7 @@ int main(int argc, const char ** argv) {
// sum requires contiguous tensor rows // sum requires contiguous tensor rows
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3))); struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -1319,7 +1351,7 @@ int main(int argc, const char ** argv) {
// sum requires contiguous tensor rows // sum requires contiguous tensor rows
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0]))); struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));
check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
@ -1337,7 +1369,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));
check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
// diag_mask_inf // diag_mask_inf
@ -1353,7 +1385,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));
check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
// diag_mask_zero // diag_mask_zero
@ -1369,7 +1401,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));
check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
// softmax // softmax
@ -1395,7 +1427,7 @@ int main(int argc, const char ** argv) {
1.0f - eps), 1.0f - eps),
ggml_new_f32(ctx0, eps)))); ggml_new_f32(ctx0, eps))));
check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY); check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY, {});
// NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf. // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
// this may result in gradients that differ from finite differences. // this may result in gradients that differ from finite differences.
// when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause. // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
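Background for the note above: check_gradient compares the analytic backprop gradient against a central finite difference of the forward pass, so any approximation in the forward pass (such as an f16 table lookup) lands directly in the estimate and is amplified by the 1/(2h) factor. A self-contained illustration of the comparison, using sin/cos rather than the actual ggml ops:

    #include <cmath>
    #include <cstdio>

    static double my_sin(double x) { return std::sin(x); }

    // Central finite difference: (f(x+h) - f(x-h)) / (2h). Forward-pass
    // error is divided by 2h, which is why the silu and softmax tests
    // above loosen their error bounds under GGML_SILU_FP16.
    static double finite_diff(double (*f)(double), double x, double h) {
        return (f(x + h) - f(x - h)) / (2.0 * h);
    }

    int main() {
        const double x = 0.5;
        const double analytic = std::cos(x); // exact derivative of sin
        const double numeric  = finite_diff(my_sin, x, 1e-3);
        printf("analytic=%.8f numeric=%.8f diff=%.2e\n",
               analytic, numeric, std::fabs(analytic - numeric));
        return 0;
    }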
@ -1412,7 +1444,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 4); get_random_dims(ne2, 4);
for (int ndims = 1; ndims <= 4; ++ndims) { for (int ndims = 1; ndims <= 4; ++ndims) {
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f); x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f); x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
// the second argument to cross_entropy_loss must sum up to 1 for each row // the second argument to cross_entropy_loss must sum up to 1 for each row
int nr = ggml_nrows(x[1]); int nr = ggml_nrows(x[1]);
@ -1430,7 +1462,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]); struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);
check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY); check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, {});
} }
} }
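The comment above requires each row of the second cross_entropy_loss argument to sum to 1; the test builds this with ggml ops, but the underlying step is plain row normalization. A minimal standalone sketch:

    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    // Normalize each row of an nr x nc matrix so it sums to 1, turning
    // arbitrary non-negative values into a valid probability distribution.
    static void normalize_rows(std::vector<float> & m, int nr, int nc) {
        for (int r = 0; r < nr; ++r) {
            float sum = 0.0f;
            for (int c = 0; c < nc; ++c) sum += m[r*nc + c];
            if (sum == 0.0f) continue; // avoid div-by-zero (never hit with the data below)
            for (int c = 0; c < nc; ++c) m[r*nc + c] /= sum;
        }
    }

    int main() {
        const int nr = 2, nc = 4;
        std::vector<float> m(nr*nc);
        for (float & v : m) v = (float) rand() / RAND_MAX; // values in [0, 1)
        normalize_rows(m, nr, nc);
        for (int r = 0; r < nr; ++r) {
            float sum = 0.0f;
            for (int c = 0; c < nc; ++c) sum += m[r*nc + c];
            printf("row %d sums to %f\n", r, sum); // ~1.0
        }
        return 0;
    }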
@ -1468,7 +1500,7 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode));
GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY); check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY, {});
} }
} }
} }
@ -1508,12 +1540,93 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode));
GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY); check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY, {});
} }
} }
} }
} }
// im2col f32
{
srand(seed);
const int nargs = 1;
const int ndims = 4;
for (const bool is_2D : {false, true}) {
int64_t ne0[ndims];
int64_t ne1[ndims];
get_random_dims(ne0, ndims);
get_random_dims(ne1, ndims);
// Ensure that the output is not zero-sized:
ne1[0] += 8;
ne1[1] += 8;
if (is_2D) {
ne1[2] = ne0[2];
} else {
ne1[1] = ne0[1];
ne0[3] = 1;
ne1[3] = 1;
}
// The order of arguments is swapped because the first tensor is only used for its shape (see the im2col sketch after this block).
x[1] = get_random_tensor_f16(ctx0, ndims, ne0, -1.0f, 1.0f);
x[0] = get_random_tensor_f32(ctx0, ndims, ne1, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
const int s0 = 1 + irand(2);
const int s1 = is_2D ? 1 + irand(2) : 0;
const int p0 = 0 + irand(2);
const int p1 = is_2D ? 0 + irand(2) : 0;
const int d0 = 1 + irand(2);
const int d1 = is_2D ? 1 + irand(2) : 0;
struct ggml_tensor * f = ggml_sum(ctx0, ggml_im2col(ctx0, x[1], x[0], s0, s1, p0, p1, d0, d1, is_2D, GGML_TYPE_F32));
GGML_PRINT_DEBUG("im2col f32: is_2D=%s, s0=%d, s1=%d, p0=%d, p1=%d, d0=%d, d1=%d\n", is_2D ? "yes" : "no", s0, s1, p0, p1, d0, d1);
check_gradient("im2col f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY, {});
}
}
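On the argument swap noted above: ggml_im2col takes the kernel-shaped tensor first and the data second, and only the data tensor x[0] is registered as a parameter here. Conceptually, im2col copies every kernel-sized patch of the input into a contiguous row so that convolution reduces to a matrix multiply. A 1-D sketch of that unrolling, assuming unit dilation (an illustration, not the ggml kernel):

    #include <cstdio>
    #include <vector>

    // 1-D im2col: for each output position, copy the k input values the
    // kernel would see (stride s, zero padding p) into one contiguous row.
    static std::vector<float> im2col_1d(const std::vector<float> & in, int k, int s, int p) {
        const int n   = (int) in.size();
        const int out = (n + 2*p - k) / s + 1;
        std::vector<float> cols(out * k, 0.0f);
        for (int o = 0; o < out; ++o) {
            for (int j = 0; j < k; ++j) {
                const int i = o*s - p + j;
                if (i >= 0 && i < n) {
                    cols[o*k + j] = in[i];
                } // out-of-range positions stay 0 (zero padding)
            }
        }
        return cols;
    }

    int main() {
        const std::vector<float> in = {1, 2, 3, 4, 5};
        const std::vector<float> cols = im2col_1d(in, /*k=*/3, /*s=*/1, /*p=*/1);
        for (size_t i = 0; i < cols.size(); ++i) {
            printf("%g%c", cols[i], (i % 3 == 2) ? '\n' : ' ');
        }
        return 0;
    }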
// pool_2d f32
{
srand(seed);
const int nargs = 1;
const int ndims = 4;
for (const enum ggml_op_pool op : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
int64_t ne0[ndims];
get_random_dims(ne0, ndims);
ne0[0] += 8;
ne0[1] += 8;
x[0] = get_random_tensor_f32(ctx0, ndims, ne0, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
const int k0 = 2 + irand(2);
const int k1 = 2 + irand(2);
const int s0 = 2 + irand(2);
const int s1 = 2 + irand(2);
const int p0 = 0 + irand(2);
const int p1 = 0 + irand(2);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_pool_2d(ctx0, x[0], op, k0, k1, s0, s1, p0, p1));
GGML_PRINT_DEBUG("ggml_pool_2d f32: op=%s k0=%d, k1=%d, s0=%d, s1=%d, p0=%d, p1=%d\n",
op == GGML_OP_POOL_MAX ? "max" : "avg", k0, k1, s0, s1, p0, p1);
std::vector<double> expected_vals;
if (op == GGML_OP_POOL_MAX) {
expected_vals.push_back(0.0);
expected_vals.push_back(1.0);
}
check_gradient("ggml_pool_2d f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY, expected_vals);
}
}
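The expected_vals of {0.0, 1.0} for max pooling follows from its gradient structure: within a window, the argmax receives the upstream gradient and every other element receives 0 (with overlapping windows an element can in principle win more than once, so take this as the common case). A tiny sketch of max-pool backward for a single window, to make the 0/1 pattern concrete (illustrative, not the ggml implementation):

    #include <cstdio>

    int main() {
        // One 2x2 max-pool window: only the argmax receives the gradient.
        const float window[4] = {0.3f, 0.9f, 0.1f, 0.4f};
        float grad_in[4] = {0, 0, 0, 0};
        const float grad_out = 1.0f; // gradient of sum() w.r.t. the pooled value

        int argmax = 0;
        for (int i = 1; i < 4; ++i) {
            if (window[i] > window[argmax]) argmax = i;
        }
        grad_in[argmax] = grad_out; // 1 at the max element, 0 elsewhere

        for (int i = 0; i < 4; ++i) {
            printf("d/dx[%d] = %g\n", i, grad_in[i]);
        }
        return 0;
    }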
// flash_attn f32 // flash_attn f32
// TODO: adapt to ggml_flash_attn_ext() changes // TODO: adapt to ggml_flash_attn_ext() changes
//{ //{
@ -1553,7 +1666,7 @@ int main(int argc, const char ** argv) {
// struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); // struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
// check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); // check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY, {});
// } // }
// } // }
// } // }

View file

@ -14,7 +14,7 @@ MODELS_REPO_URL=https://huggingface.co/ggml-org/$MODELS_REPO
# Clone the Hugging Face repository if the directory does not exist # Clone the Hugging Face repository if the directory does not exist
if [ ! -d "$MODELS_REPO" ]; then if [ ! -d "$MODELS_REPO" ]; then
echo "Cloning the Hugging Face repository..." echo "Cloning the Hugging Face repository..."
git clone $MODELS_REPO_URL git clone $MODELS_REPO_URL --depth 1
else else
echo "Repository already exists. Skipping clone." echo "Repository already exists. Skipping clone."
fi fi

View file

@ -113,7 +113,7 @@ static struct ggml_tensor * get_random_tensor_f32(
} }
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) { static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) { if (plan.work_size > 0) {
buf.resize(plan.work_size); buf.resize(plan.work_size);
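The added nullptr reflects ggml_graph_plan gaining a third parameter, by all appearances an optional threadpool handle where nullptr selects the default behavior. For context, the full helper presumably reads as below; the work_data assignment and the ggml_graph_compute call follow the standard ggml pattern and are reconstructed here, not shown in this hunk:

    // Assumes the ggml headers and <vector> already included by this test.
    static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
        struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, /*threadpool*/ nullptr);
        if (plan.work_size > 0) {
            buf.resize(plan.work_size);
            plan.work_data = buf.data(); // hand the plan its scratch buffer
        }
        ggml_graph_compute(graph, &plan);
    }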

View file

@ -166,12 +166,12 @@ static void test_sampler_queue(
for (auto s : samplers_sequence) { for (auto s : samplers_sequence) {
switch (s){ switch (s){
case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break; case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break;
case 'f': GGML_ABORT("tail_free test not implemented"); break; case 'f': GGML_ABORT("tail_free test not implemented");
case 'y': GGML_ABORT("typical test not implemented"); break; case 'y': GGML_ABORT("typical test not implemented");
case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break; case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break;
case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break; case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break;
case 't': GGML_ABORT("temperature test not implemented"); break; case 't': GGML_ABORT("temperature test not implemented");
default : GGML_ABORT("Unknown sampler"); break; default : GGML_ABORT("Unknown sampler");
} }
llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
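The break statements after GGML_ABORT are removed because the macro never returns, so anything after the call is unreachable (and some compilers warn about it). A minimal sketch of the pattern, with abort_with as a hypothetical stand-in for GGML_ABORT:

    #include <cstdio>
    #include <cstdlib>

    // Hypothetical stand-in for GGML_ABORT: marked noreturn, so any
    // statement after a call to it is dead code.
    [[noreturn]] static void abort_with(const char * msg) {
        fprintf(stderr, "%s\n", msg);
        abort();
    }

    static int dispatch(char s) {
        switch (s) {
            case 'k': return 1;                                 // implemented sampler
            case 'f': abort_with("tail_free not implemented");  // no break needed
            default : abort_with("unknown sampler");
        }
    }

    int main() {
        printf("%d\n", dispatch('k'));
        return 0;
    }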