Update branch

commit f1fb9141d2
Author: Andrei Betlen
Date:   2024-09-29 15:10:27 -04:00
248 changed files with 34592 additions and 19660 deletions


@@ -1,18 +1,16 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
 COPY requirements.txt requirements.txt
 COPY requirements requirements
@@ -24,13 +22,12 @@ WORKDIR /app
 COPY . .
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-RUN make -j$(nproc)
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .
 ENTRYPOINT ["/app/.devops/tools.sh"]


@@ -0,0 +1,44 @@
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
FROM cosdt/cann:$ASCEND_VERSION AS build
WORKDIR /app
COPY . .
RUN yum install -y gcc g++ cmake make
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
# find libascend_hal.so, because the drive hasn`t been mounted.
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --config Release --target llama-cli
# TODO: use image with NNRT
FROM cosdt/cann:$ASCEND_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
ENTRYPOINT ["/llama-cli" ]


@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image
@@ -8,28 +8,30 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git cmake
 WORKDIR /app
 COPY . .
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-RUN make -j$(nproc) llama-cli
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc)
 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 RUN apt-get update && \
     apt-get install -y libgomp1
-COPY --from=build /app/llama-cli /llama-cli
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-cli /llama-cli
 ENTRYPOINT [ "/llama-cli" ]


@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image
@@ -8,31 +8,34 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
 WORKDIR /app
 COPY . .
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-RUN make -j$(nproc) llama-server
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc)
 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl
-COPY --from=build /app/llama-server /llama-server
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-server /llama-server
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
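Setting LLAMA_ARG_HOST=0.0.0.0 makes the server bind all interfaces inside the container so a published port is reachable from the host, which is also what the HEALTHCHECK relies on. A hedged run and health-check sketch (image tag, model path, and mount are assumptions):

$ docker run --rm --gpus all -p 8080:8080 -v /path/to/models:/models \
    ghcr.io/ggerganov/llama.cpp:server-cuda -m /models/model.gguf
$ curl -f http://localhost:8080/health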


@@ -26,6 +26,8 @@ RUN apt-get update && \
 COPY --from=build /app/build/bin/llama-server /llama-server
 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]


@@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 # Enable cURL
 ENV LLAMA_CURL=1


@@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \
     rm -rf /app
 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]


@@ -21,6 +21,8 @@ RUN apt-get update && \
 COPY --from=build /app/llama-server /llama-server
 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]


@@ -1,13 +1,52 @@
+{ inputs, ... }:
 {
   perSystem =
-    { config, lib, ... }:
+    {
+      config,
+      lib,
+      system,
+      ...
+    }:
     {
       devShells =
-        lib.concatMapAttrs
-          (name: package: {
-            ${name} = package.passthru.shell;
-            ${name + "-extra"} = package.passthru.shell-extra;
-          })
-          config.packages;
+        let
+          pkgs = import inputs.nixpkgs { inherit system; };
+          stdenv = pkgs.stdenv;
+          scripts = config.packages.python-scripts;
+        in
+        lib.pipe (config.packages) [
+          (lib.concatMapAttrs (
+            name: package: {
+              ${name} = pkgs.mkShell {
+                name = "${name}";
+                inputsFrom = [ package ];
+                shellHook = ''
+                  echo "Entering ${name} devShell"
+                '';
+              };
+              "${name}-extra" =
+                if (name == "python-scripts") then
+                  null
+                else
+                  pkgs.mkShell {
+                    name = "${name}-extra";
+                    inputsFrom = [
+                      package
+                      scripts
+                    ];
+                    # Extra packages that *may* be used by some scripts
+                    packages = [
+                      pkgs.python3Packages.tiktoken
+                    ];
+                    shellHook = ''
+                      echo "Entering ${name} devShell"
+                      addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
+                    '';
+                  };
+            }
+          ))
+          (lib.filterAttrs (name: value: value != null))
+        ];
     };
 }
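The devShells are now plain pkgs.mkShell wrappers generated per package, with a "-extra" variant that additionally pulls in the python-scripts package (and which is filtered out for python-scripts itself). A hedged usage sketch; the attribute names are assumed to follow the package names defined in the scope, which is not spelled out in this hunk:

$ nix develop .#llama-cpp          # shell with the package's build inputs
$ nix develop .#llama-cpp-extra    # also includes the Python scripts and tiktoken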


@@ -26,16 +26,14 @@
           config.cudaSupport = true;
           config.allowUnfreePredicate =
             p:
-            builtins.all
-              (
-                license:
-                license.free
-                || builtins.elem license.shortName [
-                  "CUDA EULA"
-                  "cuDNN EULA"
-                ]
-              )
-              (p.meta.licenses or [ p.meta.license ]);
+            builtins.all (
+              license:
+              license.free
+              || builtins.elem license.shortName [
+                "CUDA EULA"
+                "cuDNN EULA"
+              ]
+            ) (p.meta.licenses or [ p.meta.license ]);
         };
       # Ensure dependencies use ROCm consistently
       pkgsRocm = import inputs.nixpkgs {


@@ -0,0 +1,36 @@
{
lib,
llamaVersion,
numpy,
tqdm,
sentencepiece,
pyyaml,
poetry-core,
buildPythonPackage,
pytestCheckHook,
}:
buildPythonPackage {
pname = "gguf";
version = llamaVersion;
pyproject = true;
nativeBuildInputs = [ poetry-core ];
propagatedBuildInputs = [
numpy
tqdm
sentencepiece
pyyaml
];
src = lib.cleanSource ../../gguf-py;
pythonImportsCheck = [
"numpy"
"gguf"
];
nativeCheckInputs = [ pytestCheckHook ];
doCheck = true;
meta = with lib; {
description = "Python package for writing binary files in the GGUF format";
license = licenses.mit;
maintainers = [ maintainers.ditsuke ];
};
}


@@ -3,31 +3,33 @@
   glibc,
   config,
   stdenv,
-  mkShell,
   runCommand,
   cmake,
   ninja,
   pkg-config,
   git,
-  python3,
   mpi,
   blas,
   cudaPackages,
+  autoAddDriverRunpath,
   darwin,
   rocmPackages,
   vulkan-headers,
   vulkan-loader,
   curl,
   shaderc,
-  useBlas ? builtins.all (x: !x) [
-    useCuda
-    useMetalKit
-    useRocm
-    useVulkan
-  ] && blas.meta.available,
+  useBlas ?
+    builtins.all (x: !x) [
+      useCuda
+      useMetalKit
+      useRocm
+      useVulkan
+    ]
+    && blas.meta.available,
   useCuda ? config.cudaSupport,
   useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
-  useMpi ? false, # Increases the runtime closure size by ~700M
+  # Increases the runtime closure size by ~700M
+  useMpi ? false,
   useRocm ? config.rocmSupport,
   enableCurl ? true,
   useVulkan ? false,
@@ -37,8 +39,8 @@
   # otherwise we get libstdc++ errors downstream.
   effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
   enableStatic ? effectiveStdenv.hostPlatform.isStatic,
-  precompileMetalShaders ? false
-}@inputs:
+  precompileMetalShaders ? false,
+}:

let
  inherit (lib)
@@ -46,7 +48,6 @@ let
    cmakeFeature
    optionals
    strings
-    versionOlder
    ;
  stdenv = throw "Use effectiveStdenv instead";
@@ -62,54 +63,11 @@ let
   pnameSuffix =
     strings.optionalString (suffices != [ ])
       "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix =
-    strings.optionalString (suffices != [ ])
-      ", accelerated with ${strings.concatStringsSep ", " suffices}";
-  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
-
-  # TODO: package the Python in this repository in a Nix-like way.
-  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
-  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
-  # https://peps.python.org/pep-0517/
-  #
-  # TODO: Package up each Python script or service appropriately, by making
-  # them into "entrypoints"
-  llama-python = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-    ]
-  );
-
-  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
-  llama-python-extra = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-      ps.tiktoken
-      ps.torchWithoutCuda
-      ps.transformers
-
-      # server bench
-      ps.matplotlib
-
-      # server tests
-      ps.openai
-      ps.behave
-      ps.prometheus-client
-
-      # for examples/pydantic-models-to-grammar-examples.py
-      ps.docstring-parser
-      ps.pydantic
-
-      # for scripts/compare-llama-bench.py
-      ps.gitpython
-      ps.tabulate
-    ]
-  );
-
-  xcrunHost = runCommand "xcrunHost" {} ''
+  descriptionSuffix = strings.optionalString (
+    suffices != [ ]
+  ) ", accelerated with ${strings.concatStringsSep ", " suffices}";
+
+  xcrunHost = runCommand "xcrunHost" { } ''
     mkdir -p $out/bin
     ln -s /usr/bin/xcrun $out/bin
   '';
@ -144,181 +102,145 @@ let
]; ];
in in
effectiveStdenv.mkDerivation ( effectiveStdenv.mkDerivation (finalAttrs: {
finalAttrs: { pname = "llama-cpp${pnameSuffix}";
pname = "llama-cpp${pnameSuffix}"; version = llamaVersion;
version = llamaVersion;
# Note: none of the files discarded here are visible in the sandbox or # Note: none of the files discarded here are visible in the sandbox or
# affect the output hash. This also means they can be modified without # affect the output hash. This also means they can be modified without
# triggering a rebuild. # triggering a rebuild.
src = lib.cleanSourceWith { src = lib.cleanSourceWith {
filter = filter =
name: type: name: type:
let let
noneOf = builtins.all (x: !x); noneOf = builtins.all (x: !x);
baseName = baseNameOf name; baseName = baseNameOf name;
in in
noneOf [ noneOf [
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
(lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
(lib.hasPrefix "." baseName) # Skip hidden files and directories (lib.hasPrefix "." baseName) # Skip hidden files and directories
(baseName == "flake.lock") (baseName == "flake.lock")
];
src = lib.cleanSource ../../.;
};
postPatch = ''
substituteInPlace ./ggml/src/ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
substituteInPlace ./ggml/src/ggml-metal.m \
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
'';
# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
# `default.metallib` may be compiled with Metal compiler from XCode
# and we need to escape sandbox on MacOS to access Metal compiler.
# `xcrun` is used find the path of the Metal compiler, which is varible
# and not on $PATH
# see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
__noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
nativeBuildInputs =
[
cmake
ninja
pkg-config
git
]
++ optionals useCuda [
cudaPackages.cuda_nvcc
# TODO: Replace with autoAddDriverRunpath
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
cudaPackages.autoAddOpenGLRunpathHook
]
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
glibc.static
] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
xcrunHost
]; ];
src = lib.cleanSource ../../.;
};
buildInputs = postPatch = ''
optionals effectiveStdenv.isDarwin darwinBuildInputs substituteInPlace ./ggml/src/ggml-metal.m \
++ optionals useCuda cudaBuildInputs --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
++ optionals useMpi [ mpi ] substituteInPlace ./ggml/src/ggml-metal.m \
++ optionals useRocm rocmBuildInputs --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
++ optionals useBlas [ blas ] '';
++ optionals useVulkan vulkanBuildInputs
++ optionals enableCurl [ curl ];
cmakeFlags = # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
[ # `default.metallib` may be compiled with Metal compiler from XCode
(cmakeBool "LLAMA_BUILD_SERVER" true) # and we need to escape sandbox on MacOS to access Metal compiler.
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic)) # `xcrun` is used find the path of the Metal compiler, which is varible
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) # and not on $PATH
(cmakeBool "LLAMA_CURL" enableCurl) # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
(cmakeBool "GGML_NATIVE" false) __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
(cmakeBool "GGML_BLAS" useBlas)
(cmakeBool "GGML_CUDA" useCuda) nativeBuildInputs =
(cmakeBool "GGML_HIPBLAS" useRocm) [
(cmakeBool "GGML_METAL" useMetalKit) cmake
(cmakeBool "GGML_VULKAN" useVulkan) ninja
(cmakeBool "GGML_STATIC" enableStatic) pkg-config
] git
++ optionals useCuda [ ]
( ++ optionals useCuda [
with cudaPackages.flags; cudaPackages.cuda_nvcc
cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
builtins.concatStringsSep ";" (map dropDot cudaCapabilities) autoAddDriverRunpath
) ]
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
buildInputs =
optionals effectiveStdenv.isDarwin darwinBuildInputs
++ optionals useCuda cudaBuildInputs
++ optionals useMpi [ mpi ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
++ optionals useVulkan vulkanBuildInputs
++ optionals enableCurl [ curl ];
cmakeFlags =
[
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_CURL" enableCurl)
(cmakeBool "GGML_NATIVE" false)
(cmakeBool "GGML_BLAS" useBlas)
(cmakeBool "GGML_CUDA" useCuda)
(cmakeBool "GGML_HIPBLAS" useRocm)
(cmakeBool "GGML_METAL" useMetalKit)
(cmakeBool "GGML_VULKAN" useVulkan)
(cmakeBool "GGML_STATIC" enableStatic)
]
++ optionals useCuda [
(
with cudaPackages.flags;
cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
) )
] )
++ optionals useRocm [ ]
(cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang") ++ optionals useRocm [
(cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets)) (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
] (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
++ optionals useMetalKit [ ]
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ++ optionals useMetalKit [
(cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders)) (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
]; (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
];
# Environment variables needed for ROCm # Environment variables needed for ROCm
env = optionals useRocm { env = optionals useRocm {
ROCM_PATH = "${rocmPackages.clr}"; ROCM_PATH = "${rocmPackages.clr}";
HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode"; HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
}; };
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
# if they haven't been added yet. # if they haven't been added yet.
postInstall = '' postInstall = ''
mkdir -p $out/include mkdir -p $out/include
cp $src/include/llama.h $out/include/ cp $src/include/llama.h $out/include/
''; '';
# Define the shells here, but don't add in the inputsFrom to avoid recursion. meta = {
passthru = { # Configurations we don't want even the CI to evaluate. Results in the
inherit # "unsupported platform" messages. This is mostly a no-op, because
useBlas # cudaPackages would've refused to evaluate anyway.
useCuda badPlatforms = optionals useCuda lib.platforms.darwin;
useMetalKit
useMpi
useRocm
useVulkan
;
shell = mkShell { # Configurations that are known to result in build failures. Can be
name = "shell-${finalAttrs.finalPackage.name}"; # overridden by importing Nixpkgs with `allowBroken = true`.
description = "contains numpy and sentencepiece"; broken = (useMetalKit && !effectiveStdenv.isDarwin);
buildInputs = [ llama-python ];
inputsFrom = [ finalAttrs.finalPackage ];
shellHook = ''
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
'';
};
shell-extra = mkShell { description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
name = "shell-extra-${finalAttrs.finalPackage.name}"; homepage = "https://github.com/ggerganov/llama.cpp/";
description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers"; license = lib.licenses.mit;
buildInputs = [ llama-python-extra ];
inputsFrom = [ finalAttrs.finalPackage ];
};
};
meta = { # Accommodates `nix run` and `lib.getExe`
# Configurations we don't want even the CI to evaluate. Results in the mainProgram = "llama-cli";
# "unsupported platform" messages. This is mostly a no-op, because
# cudaPackages would've refused to evaluate anyway.
badPlatforms = optionals useCuda lib.platforms.darwin;
# Configurations that are known to result in build failures. Can be # These people might respond, on the best effort basis, if you ping them
# overridden by importing Nixpkgs with `allowBroken = true`. # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
broken = (useMetalKit && !effectiveStdenv.isDarwin); # Consider adding yourself to this list if you want to ensure this flake
# stays maintained and you're willing to invest your time. Do not add
# other people without their consent. Consider removing people after
# they've been unreachable for long periods of time.
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; # Note that lib.maintainers is defined in Nixpkgs, but you may just add
homepage = "https://github.com/ggerganov/llama.cpp/"; # an attrset following the same format as in
license = lib.licenses.mit; # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
maintainers = with lib.maintainers; [
philiptaron
SomeoneSerge
];
# Accommodates `nix run` and `lib.getExe` # Extend `badPlatforms` instead
mainProgram = "llama-cli"; platforms = lib.platforms.all;
};
# These people might respond, on the best effort basis, if you ping them })
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
# Consider adding yourself to this list if you want to ensure this flake
# stays maintained and you're willing to invest your time. Do not add
# other people without their consent. Consider removing people after
# they've been unreachable for long periods of time.
# Note that lib.maintainers is defined in Nixpkgs, but you may just add
# an attrset following the same format as in
# https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
maintainers = with lib.maintainers; [
philiptaron
SomeoneSerge
];
# Extend `badPlatforms` instead
platforms = lib.platforms.all;
};
}
)


@@ -0,0 +1,66 @@
{
lib,
stdenv,
buildPythonPackage,
poetry-core,
mkShell,
python3Packages,
gguf-py,
}@inputs:
let
llama-python-deps = with python3Packages; [
numpy
sentencepiece
transformers
protobuf
torchWithoutCuda
gguf-py
tqdm
# for scripts/compare-llama-bench.py
gitpython
tabulate
# for examples/pydantic-models-to-grammar-examples.py
docstring-parser
pydantic
];
llama-python-test-deps = with python3Packages; [
# Server bench
matplotlib
# server tests
openai
behave
prometheus-client
];
in
buildPythonPackage ({
pname = "llama-scripts";
version = "0.0.0";
pyproject = true;
# NOTE: The files filtered out here are not visible in the build sandbox, neither
# do they affect the output hash. They can be modified without triggering a rebuild.
src = lib.cleanSourceWith {
filter =
name: type:
let
any = builtins.any (x: x);
baseName = builtins.baseNameOf name;
in
any [
(lib.hasSuffix ".py" name)
(baseName == "README.md")
(baseName == "pyproject.toml")
];
src = lib.cleanSource ../../.;
};
nativeBuildInputs = [ poetry-core ];
nativeCheckInputs = llama-python-test-deps;
dependencies = llama-python-deps;
})


@@ -1,19 +1,41 @@
 {
   lib,
   newScope,
+  python3,
   llamaVersion ? "0.0.0",
 }:
+let
+  pythonPackages = python3.pkgs;
+  buildPythonPackage = pythonPackages.buildPythonPackage;
+  numpy = pythonPackages.numpy;
+  tqdm = pythonPackages.tqdm;
+  sentencepiece = pythonPackages.sentencepiece;
+  pyyaml = pythonPackages.pyyaml;
+  poetry-core = pythonPackages.poetry-core;
+  pytestCheckHook = pythonPackages.pytestCheckHook;
+in
 # We're using `makeScope` instead of just writing out an attrset
 # because it allows users to apply overlays later using `overrideScope'`.
 # Cf. https://noogle.dev/f/lib/makeScope
-lib.makeScope newScope (
-  self: {
-    inherit llamaVersion;
-    llama-cpp = self.callPackage ./package.nix { };
-    docker = self.callPackage ./docker.nix { };
-    docker-min = self.callPackage ./docker.nix { interactive = false; };
-    sif = self.callPackage ./sif.nix { };
-  }
-)
+lib.makeScope newScope (self: {
+  inherit llamaVersion;
+  gguf-py = self.callPackage ./package-gguf-py.nix {
+    inherit
+      buildPythonPackage
+      numpy
+      tqdm
+      sentencepiece
+      poetry-core
+      pyyaml
+      pytestCheckHook
+      ;
+  };
+  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
+  llama-cpp = self.callPackage ./package.nix { };
+  docker = self.callPackage ./docker.nix { };
+  docker-min = self.callPackage ./docker.nix { interactive = false; };
+  sif = self.callPackage ./sif.nix { };
+})
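The scope now also exposes gguf-py and python-scripts next to llama-cpp. A hedged sketch of poking at them from the flake; whether these scope members are surfaced as flake package outputs is an assumption not verified by this diff:

$ nix flake show                 # list which scope members the flake actually exposes
$ nix build .#gguf-py            # assuming gguf-py is surfaced as a package output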


@@ -1,7 +1,7 @@
 *.o
 *.a
 .cache/
-.git/
+# Do not ignore .git directory, otherwise the reported build number will always be 0
 .github/
 .gitignore
 .vs/

.ecrc

@@ -1,5 +1,5 @@
 {
-  "Exclude": ["^\\.gitmodules$"],
+  "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
   "Disable": {
     "IndentSize": true
   }


@@ -1,3 +1,6 @@
+# TODO: there have been some issues with the workflow, so disabling for now
+# https://github.com/ggerganov/llama.cpp/issues/7893
+#
 # Benchmark
 name: Benchmark
@@ -129,6 +132,8 @@ jobs:
       - name: Server bench
         id: server_bench
+        env:
+          HEAD_REF: ${{ github.head_ref || github.ref_name }}
         run: |
           set -eux
@@ -137,7 +142,7 @@ jobs:
           python bench.py \
             --runner-label ${{ env.RUNNER_LABEL }} \
            --name ${{ github.job }} \
-            --branch ${{ github.head_ref || github.ref_name }} \
+            --branch $HEAD_REF \
            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
            --scenario script.js \
            --duration ${{ github.event.inputs.duration || env.DURATION }} \


@@ -23,6 +23,9 @@ env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
 jobs:
   macOS-latest-cmake-arm64:
@@ -47,7 +50,7 @@
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
       - name: Test
@@ -105,7 +108,7 @@
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
+          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
       - name: Test
@@ -222,7 +225,7 @@
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build . --config Release -j $(nproc)
       - name: Test
@@ -375,7 +378,7 @@
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
      - name: Dependencies
        id: depends
@@ -401,7 +404,7 @@
     continue-on-error: true
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: add oneAPI to apt
        shell: bash
@@ -442,7 +445,7 @@
     continue-on-error: true
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: add oneAPI to apt
        shell: bash
@@ -546,7 +549,7 @@
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4
      - name: Dependencies
        id: depends
@@ -576,7 +579,7 @@
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4
      - name: Dependencies
        id: depends
@@ -610,7 +613,7 @@
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4
      - name: Dependencies
        id: depends
@@ -696,22 +699,20 @@
     strategy:
       matrix:
        include:
-          - build: 'rpc-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'noavx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx2-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'avx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx512-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'openblas-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'llvm-arm64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'msvc-arm64'
@@ -859,7 +860,7 @@
         run: |
           mkdir build
           cd build
-          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
+          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
           cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
           cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
@@ -955,6 +956,7 @@
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
           echo "cp oneAPI running time dll files to ./build/bin done"
           7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
@@ -966,19 +968,20 @@ jobs:
           name: llama-bin-win-sycl-x64.zip
   windows-latest-cmake-hip:
+    if: ${{ github.event.inputs.create_release != 'true' }}
     runs-on: windows-latest
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
      - name: Install
        id: depends
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP SDK installation"
@@ -993,8 +996,72 @@ jobs:
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON
-          cmake --build build --config Release
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
windows-latest-cmake-hip-release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
runs-on: windows-latest
strategy:
matrix:
gpu_target: [gfx1100, gfx1101, gfx1030]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Install
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
id: verify
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: Build
id: cmake_build
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
run: |
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
   ios-xcode-build:
     runs-on: macos-latest
@@ -1059,6 +1126,7 @@ jobs:
       - macOS-latest-cmake
       - windows-latest-cmake
       - windows-latest-cmake-cuda
+      - windows-latest-cmake-hip-release
       - macOS-latest-cmake-arm64
       - macOS-latest-cmake-x64


@@ -15,11 +15,17 @@ on:
     branches:
       - master
     paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
+  workflow_dispatch: # allows manual triggering, useful for debugging
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
   cancel-in-progress: true
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  packages: write
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
@@ -37,15 +43,17 @@ jobs:
          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
+          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
+          #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
     steps:
       - name: Check out the repo
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # preserve git history, so we can determine the build number
       - name: Set up QEMU
         uses: docker/setup-qemu-action@v2
@@ -60,6 +68,34 @@ jobs:
           username: ${{ github.repository_owner }}
           password: ${{ secrets.GITHUB_TOKEN }}
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
# determine tag name postfix (build number, commit hash)
if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
TAG_POSTFIX="b${BUILD_NUMBER}"
else
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
fi
# list all tags possible
TAGS=""
TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
echo "output_tags=$TAGS" # print out for debugging
env:
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
       # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
       - name: Free Disk Space (Ubuntu)
         uses: jlumbroso/free-disk-space@main
@@ -77,40 +113,13 @@ jobs:
           docker-images: true
           swap-storage: true
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-      - name: Downcase github.repository_owner
-        run: |
-          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
-        env:
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-      - name: Build and push Docker image (versioned)
+      - name: Build and push Docker image (tagged + versioned)
         if: github.event_name == 'push'
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v6
         with:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
-          file: ${{ matrix.config.dockerfile }}
-      - name: Build and push Docker image (tagged)
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          push: ${{ github.event_name == 'push' }}
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.output_tags }}
           file: ${{ matrix.config.dockerfile }}
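The push step now consumes the single tag list computed in the "Determine tag name" step above. Roughly the same computation can be reproduced locally; a sketch with the branch, owner, and repo filled in as placeholders for what the workflow takes from the GitHub context:

$ BUILD_NUMBER=$(git rev-list --count HEAD)
$ SHORT_HASH=$(git rev-parse --short=7 HEAD)
$ BRANCH=my-feature                                   # github.head_ref || github.ref_name in CI
$ if [ "$BRANCH" = "master" ]; then TAG_POSTFIX="b${BUILD_NUMBER}"; \
  else TAG_POSTFIX="$(echo "$BRANCH" | tr '/' '-')-${SHORT_HASH}"; fi
$ echo "ghcr.io/<owner>/<repo>:server-cuda,ghcr.io/<owner>/<repo>:server-cuda-${TAG_POSTFIX}"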


@@ -6,15 +6,13 @@ on:
       - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
+      - '**/requirements*.txt'
   pull_request:
     paths:
       - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
+      - '**/requirements*.txt'
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}


@@ -20,6 +20,12 @@ on:
     types: [opened, synchronize, reopened]
     paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+env:
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_LOG_VERBOSITY: 10
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
@@ -173,6 +179,7 @@ jobs:
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
+          $env:PYTHONIOENCODING = ":replace"
           behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
       - name: Slow tests

.gitignore

@@ -61,6 +61,7 @@ llama-batched-swift
 /rpc-server
 out/
 tmp/
+autogen-*.md
 # Deprecated
@@ -129,3 +130,6 @@ poetry.toml
 # Scripts
 !/scripts/install-oneapi.bat
+
+# Test models for lora adapters
+/lora-tests


@@ -62,6 +62,9 @@ option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
 option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
+# utils
+option(LLAMA_BUILD_COMMON "llama: build common utils library" ON)
 # extra artifacts
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -82,11 +85,11 @@ set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
 # change the default for these ggml options
 if (NOT DEFINED GGML_LLAMAFILE)
-    set(GGML_LLAMAFILE ON)
+    set(GGML_LLAMAFILE_DEFAULT ON)
 endif()
-if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
-    set(GGML_CUDA_USE_GRAPHS ON)
+if (NOT DEFINED GGML_CUDA_GRAPHS)
+    set(GGML_CUDA_GRAPHS_DEFAULT ON)
 endif()
 # transition helpers
@@ -139,10 +142,16 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
 # determining _precisely_ which defines are necessary for the llama-config
 # package.
 #
+set(GGML_TRANSIENT_DEFINES)
 get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
 get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
+if (GGML_DIR_DEFINES)
+    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
+endif()
 get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
-set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
+if (GGML_TARGET_DEFINES)
+    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
+endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
 set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
@@ -185,15 +194,17 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
         DESTINATION lib/pkgconfig)
 #
-# programs, examples and tests
+# utils, programs, examples and tests
 #
-add_subdirectory(common)
+if (LLAMA_BUILD_COMMON)
+    add_subdirectory(common)
+endif()
 if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
     include(CTest)
     add_subdirectory(tests)
-endif ()
+endif()
 if (LLAMA_BUILD_EXAMPLES)
     add_subdirectory(examples)
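The new LLAMA_BUILD_COMMON option lets embedders build only the core library and skip the common utils subdirectory. A hedged configure sketch; since the examples and tests depend on common, they would normally be disabled together with it:

$ cmake -B build -DLLAMA_BUILD_COMMON=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF
$ cmake --build build --config Release -j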


@@ -28,11 +28,12 @@
     { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
     { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
     { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
+    { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
     {
       "name": "arm64-windows-msvc", "hidden": true,
       "architecture": { "value": "arm64", "strategy": "external" },
-      "toolset": { "value": "host=x86_64", "strategy": "external" },
+      "toolset": { "value": "host=x64", "strategy": "external" },
       "cacheVariables": {
         "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
       }
@@ -40,8 +41,8 @@
     {
       "name": "arm64-windows-llvm", "hidden": true,
       "architecture": { "value": "arm64", "strategy": "external" },
-      "toolset": { "value": "host=x86_64", "strategy": "external" },
+      "toolset": { "value": "host=x64", "strategy": "external" },
       "cacheVariables": {
         "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
       }
@@ -60,6 +61,8 @@
     { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
     { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
-    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
+    { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
+    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
+    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
   ]
 }
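Using the new FP16 SYCL presets is just a matter of configuring with the preset name defined above; setting up the oneAPI environment first is assumed and outside this diff:

$ cmake --preset x64-windows-sycl-release-f16
$ cmake --build --preset x64-windows-sycl-release-f16   # if a matching build preset exists; otherwise point --build at the preset's binary directory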


@@ -39,10 +39,12 @@ BUILD_TARGETS = \
 	llama-tokenize \
 	llama-vdot \
 	llama-cvector-generator \
+	llama-gen-docs \
 	tests/test-c.o
 # Binaries only useful for tests
 TEST_TARGETS = \
+	tests/test-arg-parser \
 	tests/test-autorelease \
 	tests/test-backend-ops \
 	tests/test-chat-template \
@@ -52,6 +54,7 @@ TEST_TARGETS = \
 	tests/test-grammar-parser \
 	tests/test-json-schema-to-grammar \
 	tests/test-llama-grammar \
+	tests/test-log \
 	tests/test-model-load-cancel \
 	tests/test-opt \
 	tests/test-quantize-fns \
@@ -146,6 +149,14 @@ GGML_NO_METAL := 1
 DEPRECATE_WARNING := 1
 endif
+ifdef LLAMA_DISABLE_LOGS
+REMOVE_WARNING := 1
+endif
+ifdef LLAMA_SERVER_VERBOSE
+REMOVE_WARNING := 1
+endif
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@@ -349,19 +360,11 @@ ifdef LLAMA_SANITIZE_UNDEFINED
 	MK_LDFLAGS += -fsanitize=undefined -g
 endif
-ifdef LLAMA_SERVER_VERBOSE
-	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
-endif
 ifdef LLAMA_SERVER_SSL
 	MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
 	MK_LDFLAGS += -lssl -lcrypto
 endif
-ifdef LLAMA_DISABLE_LOGS
-	MK_CPPFLAGS += -DLOG_DISABLE_LOGS
-endif # LLAMA_DISABLE_LOGS
 # warnings
 WARN_FLAGS = \
 	-Wall \
@@ -432,7 +435,7 @@ endif
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue
-ifndef RISCV
+ifndef RISCV_CROSS_COMPILE
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
@@ -512,7 +515,12 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),)
 	MK_CXXFLAGS += -mlasx
 endif
+else ifneq ($(filter riscv64%,$(UNAME_M)),)
+	MK_CFLAGS += -march=rv64gcv -mabi=lp64d
+	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
+endif
+else # RISC-V CROSS COMPILATION
 	MK_CFLAGS += -march=rv64gcv -mabi=lp64d
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
@@ -603,7 +611,7 @@ ifdef GGML_CUDA
 	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
 	MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
-	MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
+	MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22
 else
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
@@ -611,7 +619,7 @@ ifdef GGML_CUDA
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+	MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_NVCCFLAGS += -use_fast_math MK_NVCCFLAGS += -use_fast_math
endif # GGML_MUSA endif # GGML_MUSA
@ -763,6 +771,10 @@ ifdef GGML_VULKAN_MEMORY_DEBUG
MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
endif endif
ifdef GGML_VULKAN_PERF
MK_CPPFLAGS += -DGGML_VULKAN_PERF
endif
ifdef GGML_VULKAN_VALIDATE ifdef GGML_VULKAN_VALIDATE
MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
endif endif
@ -919,11 +931,12 @@ OBJ_LLAMA = \
OBJ_COMMON = \ OBJ_COMMON = \
common/common.o \ common/common.o \
common/arg.o \
common/log.o \
common/console.o \ common/console.o \
common/ngram-cache.o \ common/ngram-cache.o \
common/sampling.o \ common/sampling.o \
common/train.o \ common/train.o \
common/grammar-parser.o \
common/build-info.o \ common/build-info.o \
common/json-schema-to-grammar.o common/json-schema-to-grammar.o
@ -1016,6 +1029,14 @@ $(info - LLAMA_NO_CCACHE)
$(info ) $(info )
endif endif
ifdef REMOVE_WARNING
$(info !!! REMOVAL WARNING !!!)
$(info The following LLAMA_ options have been removed and are no longer supported)
$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418))
$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
$(info )
endif
# #
# Build libraries # Build libraries
# #
@ -1152,6 +1173,16 @@ common/common.o: \
include/llama.h include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
common/arg.o: \
common/arg.cpp \
common/arg.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/log.o: \
common/log.cpp \
common/log.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/sampling.o: \ common/sampling.o: \
common/sampling.cpp \ common/sampling.cpp \
common/sampling.h \ common/sampling.h \
@ -1163,11 +1194,6 @@ common/console.o: \
common/console.h common/console.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
common/grammar-parser.o: \
common/grammar-parser.cpp \
common/grammar-parser.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/json-schema-to-grammar.o: \ common/json-schema-to-grammar.o: \
common/json-schema-to-grammar.cpp \ common/json-schema-to-grammar.cpp \
common/json-schema-to-grammar.h common/json-schema-to-grammar.h
@ -1335,7 +1361,7 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
$(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1429,6 +1455,7 @@ llama-server: \
examples/server/system-prompts.js.hpp \ examples/server/system-prompts.js.hpp \
examples/server/prompt-formats.js.hpp \ examples/server/prompt-formats.js.hpp \
examples/server/json-schema-to-grammar.mjs.hpp \ examples/server/json-schema-to-grammar.mjs.hpp \
examples/server/loading.html.hpp \
common/json.hpp \ common/json.hpp \
common/stb_image.h \ common/stb_image.h \
$(OBJ_ALL) $(OBJ_ALL)
@ -1444,6 +1471,11 @@ examples/server/%.hpp: examples/server/public/% Makefile
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
) > $@ ) > $@
llama-gen-docs: examples/gen-docs/gen-docs.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
libllava.a: examples/llava/llava.cpp \ libllava.a: examples/llava/llava.cpp \
examples/llava/llava.h \ examples/llava/llava.h \
examples/llava/clip.cpp \ examples/llava/clip.cpp \
@ -1501,11 +1533,21 @@ run-benchmark-matmult: llama-benchmark-matmult
.PHONY: run-benchmark-matmult swift .PHONY: run-benchmark-matmult swift
tests/test-arg-parser: tests/test-arg-parser.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-llama-grammar: tests/test-llama-grammar.cpp \ tests/test-llama-grammar: tests/test-llama-grammar.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-log: tests/test-log.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-grammar-parser: tests/test-grammar-parser.cpp \ tests/test-grammar-parser: tests/test-grammar-parser.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)

README.md

@ -10,32 +10,15 @@
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
> [!IMPORTANT]
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
## Recent API changes ## Recent API changes
- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006 - [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807 - [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
## Hot topics ## Hot topics
- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430 - **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021 - Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
---- ----
@ -95,6 +78,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo) - [x] [OLMo](https://allenai.org/olmo)
- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330) - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia) - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520) - [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
@ -105,6 +89,9 @@ Typically finetunes of the base models below are supported as well.
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca) - [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
@ -126,6 +113,7 @@ Typically finetunes of the base models below are supported as well.
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama) - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
@ -179,6 +167,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT) - [AIKit](https://github.com/sozercan/aikit) (MIT)
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL) - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
@ -186,10 +175,13 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML - [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuilt Mobile and Web platform wrappers and a model example)
**Infrastructure:** **Infrastructure:**
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
**Games:** **Games:**
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
@ -422,6 +414,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
| [CUDA](./docs/build.md#cuda) | Nvidia GPU | | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](./docs/build.md#hipblas) | AMD GPU | | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
| [Vulkan](./docs/build.md#vulkan) | GPU | | [Vulkan](./docs/build.md#vulkan) | GPU |
| [CANN](./docs/build.md#cann) | Ascend NPU |
## Tools ## Tools

ci/run.sh

@ -13,6 +13,9 @@
# # with SYCL support # # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# #
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>" echo "usage: $0 <output-dir> <mnt-dir>"
@ -40,7 +43,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
fi fi
if [ ! -z ${GG_BUILD_CUDA} ]; then if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1" CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
fi fi
if [ ! -z ${GG_BUILD_SYCL} ]; then if [ ! -z ${GG_BUILD_SYCL} ]; then
@ -52,6 +55,10 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi fi
if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi
## helpers ## helpers
# download a file if it does not exist or if it is outdated # download a file if it does not exist or if it is outdated
@ -107,7 +114,7 @@ function gg_run_ctest_debug {
gg_check_build_requirements gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@ -138,7 +145,7 @@ function gg_run_ctest_release {
gg_check_build_requirements gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@ -266,7 +273,6 @@ function gg_sum_ctest_with_model_release {
} }
# open_llama_7b_v2 # open_llama_7b_v2
# requires: GG_BUILD_CUDA
function gg_run_open_llama_7b_v2 { function gg_run_open_llama_7b_v2 {
cd ${SRC} cd ${SRC}
@ -290,8 +296,8 @@ function gg_run_open_llama_7b_v2 {
set -e set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -425,7 +431,7 @@ function gg_run_pythia_1_4b {
set -e set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -535,7 +541,6 @@ function gg_sum_pythia_1_4b {
} }
# pythia_2_8b # pythia_2_8b
# requires: GG_BUILD_CUDA
function gg_run_pythia_2_8b { function gg_run_pythia_2_8b {
cd ${SRC} cd ${SRC}
@ -556,8 +561,8 @@ function gg_run_pythia_2_8b {
set -e set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -692,7 +697,7 @@ function gg_run_embd_bge_small {
set -e set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -732,6 +737,9 @@ function gg_sum_embd_bge_small {
## main ## main
export LLAMA_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
rm -rf ${SRC}/models-mnt rm -rf ${SRC}/models-mnt
@ -761,7 +769,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
fi fi
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
test $ret -eq 0 && gg_run pythia_1_4b test $ret -eq 0 && gg_run pythia_1_4b
else else
test $ret -eq 0 && gg_run pythia_2_8b test $ret -eq 0 && gg_run pythia_2_8b

common/CMakeLists.txt

@ -51,21 +51,23 @@ endif()
set(TARGET common) set(TARGET common)
add_library(${TARGET} STATIC add_library(${TARGET} STATIC
arg.cpp
arg.h
base64.hpp base64.hpp
common.h
common.cpp common.cpp
sampling.h common.h
sampling.cpp
console.h
console.cpp console.cpp
grammar-parser.h console.h
grammar-parser.cpp
json.hpp
json-schema-to-grammar.cpp json-schema-to-grammar.cpp
train.h json.hpp
train.cpp log.cpp
ngram-cache.h log.h
ngram-cache.cpp ngram-cache.cpp
ngram-cache.h
sampling.cpp
sampling.h
train.cpp
train.h
) )
if (BUILD_SHARED_LIBS) if (BUILD_SHARED_LIBS)

common/arg.cpp (new file, 1994 lines): diff suppressed because it is too large

common/arg.h (new file, 77 lines)

@ -0,0 +1,77 @@
#pragma once
#include "common.h"
#include <set>
#include <string>
#include <vector>
//
// CLI argument parsing
//
struct llama_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value
const char * env = nullptr;
std::string help;
bool is_sparam = false; // is current arg a sampling param?
void (*handler_void) (gpt_params & params) = nullptr;
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
void (*handler_int) (gpt_params & params, int) = nullptr;
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &)
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, int)
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const std::string & help,
void (*handler)(gpt_params & params)
) : args(args), help(help), handler_void(handler) {}
// support 2 values for arg
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const char * value_hint_2,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
llama_arg & set_env(const char * env);
llama_arg & set_sparam();
bool in_example(enum llama_example ex);
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();
};
struct gpt_params_context {
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
gpt_params & params;
std::vector<llama_arg> options;
void(*print_usage)(int, char **) = nullptr;
gpt_params_context(gpt_params & params) : params(params) {}
};
// parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
// function to be used by test-arg-parser
gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
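
To make the new argument-parsing surface easier to review, here is a minimal, hypothetical usage sketch (not part of this commit). It relies only on the declarations in this header plus the `gpt_params` and `llama_example` declarations from `common/common.h` later in the commit; the `print_usage` helper below is illustrative.

```cpp
// Hypothetical sketch: parse CLI arguments into gpt_params via the new common/arg.h API.
#include "arg.h"
#include "common.h"

#include <cstdio>

static void print_usage(int /*argc*/, char ** argv) {
    printf("usage: %s [options]\n", argv[0]);
}

int main(int argc, char ** argv) {
    gpt_params params;

    // on an invalid value this prints the usage of the offending argument and returns false
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

    // params is now populated from the command line (and any registered environment variables)
    printf("model: %s\n", params.model.c_str());
    return 0;
}
```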

File diff suppressed because it is too large

common/common.h

@ -4,18 +4,9 @@
#include "llama.h" #include "llama.h"
#include "sampling.h"
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
#include <cmath>
#include <string> #include <string>
#include <vector> #include <vector>
#include <random> #include <sstream>
#include <thread>
#include <unordered_map>
#include <tuple>
#ifdef _WIN32 #ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\' #define DIRECTORY_SEPARATOR '\\'
@ -54,26 +45,103 @@ struct llama_control_vector_load_info;
// CPU utils // CPU utils
// //
struct cpu_params {
int n_threads = -1;
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
bool mask_valid = false; // Default: any CPU
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
bool strict_cpu = false; // Use strict CPU placement
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};
int32_t cpu_get_num_physical_cores(); int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math(); int32_t cpu_get_num_math();
// //
// CLI argument parsing // Common params
// //
enum llama_example {
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN,
LLAMA_EXAMPLE_INFILL,
LLAMA_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL,
LLAMA_EXAMPLE_PASSKEY,
LLAMA_EXAMPLE_IMATRIX,
LLAMA_EXAMPLE_BENCH,
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_COUNT,
};
enum gpt_sampler_type {
GPT_SAMPLER_TYPE_NONE = 0,
GPT_SAMPLER_TYPE_TOP_K = 1,
GPT_SAMPLER_TYPE_TOP_P = 2,
GPT_SAMPLER_TYPE_MIN_P = 3,
GPT_SAMPLER_TYPE_TFS_Z = 4,
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
};
// dimensionality reduction methods, used by cvector-generator // dimensionality reduction methods, used by cvector-generator
enum dimre_method { enum dimre_method {
DIMRE_METHOD_PCA, DIMRE_METHOD_PCA,
DIMRE_METHOD_MEAN, DIMRE_METHOD_MEAN,
}; };
struct gpt_params { // sampler parameters
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed struct gpt_sampler_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
int32_t n_threads = cpu_get_num_math(); int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_threads_draft = -1; int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t n_threads_batch_draft = -1; int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typ_p = 1.00f; // typical_p, 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false;
bool no_perf = false; // disable performance metrics
std::vector<enum gpt_sampler_type> samplers = {
GPT_SAMPLER_TYPE_TOP_K,
GPT_SAMPLER_TYPE_TFS_Z,
GPT_SAMPLER_TYPE_TYPICAL_P,
GPT_SAMPLER_TYPE_TOP_P,
GPT_SAMPLER_TYPE_MIN_P,
GPT_SAMPLER_TYPE_TEMPERATURE
};
std::string grammar; // optional BNF-like grammar to constrain sampling
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
// print the parameters into a string
std::string print() const;
};
struct gpt_params {
int32_t n_predict = -1; // new tokens to predict int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@ -100,6 +168,11 @@ struct gpt_params {
int32_t yarn_orig_ctx = 0; // YaRN original context length int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold float defrag_thold = -1.0f; // KV cache defragmentation threshold
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
struct cpu_params draft_cpuparams;
struct cpu_params draft_cpuparams_batch;
ggml_backend_sched_eval_callback cb_eval = nullptr; ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr; void * cb_eval_user_data = nullptr;
@ -110,26 +183,25 @@ struct gpt_params {
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
// // sampling parameters struct gpt_sampler_params sparams;
struct llama_sampling_params sparams;
std::string model = ""; // model path std::string model = ""; // model path // NOLINT
std::string model_draft = ""; // draft model for speculative decoding std::string model_draft = ""; // draft model for speculative decoding // NOLINT
std::string model_alias = "unknown"; // model alias std::string model_alias = "unknown"; // model alias // NOLINT
std::string model_url = ""; // model url to download std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file std::string hf_file = ""; // HF file // NOLINT
std::string prompt = ""; std::string prompt = ""; // NOLINT
std::string prompt_file = ""; // store the external prompt file name std::string prompt_file = ""; // store the external prompt file name // NOLINT
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
std::string input_prefix = ""; // string to prefix user inputs with std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
std::string input_suffix = ""; // string to suffix user inputs with std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
std::string logdir = ""; // directory in which to save YAML log files std::string logdir = ""; // directory in which to save YAML log files // NOLINT
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits std::string logits_file = ""; // file for saving *all* logits // NOLINT
std::string rpc_servers = ""; // comma separated list of RPC servers std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
std::vector<std::string> in_files; // all input files std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@ -173,15 +245,15 @@ struct gpt_params {
bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on infinite text generation
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
bool logits_all = false; // return logits for all tokens in the batch bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation bool display_prompt = true; // print prompt before generation
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run bool warmup = true; // warmup run
@ -191,7 +263,7 @@ struct gpt_params {
std::string cache_type_v = "f16"; // KV cache data type for the V std::string cache_type_v = "f16"; // KV cache data type for the V
// multimodal models (see examples/llava) // multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector std::string mmproj = ""; // path to multimodal projector // NOLINT
std::vector<std::string> image; // path to image file(s) std::vector<std::string> image; // path to image file(s)
// embedding // embedding
@ -204,18 +276,18 @@ struct gpt_params {
int32_t port = 8080; // server listens on this network port int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::string public_path = ""; std::string public_path = ""; // NOLINT
std::string chat_template = ""; std::string chat_template = ""; // NOLINT
std::string system_prompt = ""; std::string system_prompt = ""; // NOLINT
bool enable_chat_template = true; bool enable_chat_template = true;
std::vector<std::string> api_keys; std::vector<std::string> api_keys;
std::string ssl_file_key = ""; std::string ssl_file_key = ""; // NOLINT
std::string ssl_file_cert = ""; std::string ssl_file_cert = ""; // NOLINT
bool endpoint_slots = true; bool endpoint_slots = true;
bool endpoint_metrics = false; bool endpoint_metrics = false;
@ -265,18 +337,22 @@ struct gpt_params {
bool spm_infill = false; // suffix/prefix/middle pattern for infill bool spm_infill = false; // suffix/prefix/middle pattern for infill
std::string lora_outfile = "ggml-lora-merged-f16.gguf"; std::string lora_outfile = "ggml-lora-merged-f16.gguf";
// batched-bench params
bool batched_bench_output_jsonl = false;
}; };
void gpt_params_handle_hf_token(gpt_params & params); // call once at the start of a program if it uses libcommon
void gpt_params_handle_model_default(gpt_params & params); // initializes the logging system and prints info about the build
void gpt_init();
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_params_get_system_info(const gpt_params & params); std::string gpt_params_get_system_info(const gpt_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio);
// //
// String utils // String utils
// //
@ -305,6 +381,11 @@ static std::vector<T> string_split(const std::string & str, char delim) {
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides); bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input); void string_process_escapes(std::string & input);
std::string string_from(bool value);
std::string string_from(const std::vector<int> & values);
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
// //
// Filesystem utils // Filesystem utils
// //
@ -327,8 +408,9 @@ struct llama_init_result {
struct llama_init_result llama_init_from_gpt_params(gpt_params & params); struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
@ -380,10 +462,6 @@ std::string llama_detokenize(
const std::vector<llama_token> & tokens, const std::vector<llama_token> & tokens,
bool special = true); bool special = true);
// Uses the value from the model metadata if possible, otherwise
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);
// //
// Chat template utils // Chat template utils
// //
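
Because the `gpt_params` changes are spread across several hunks above, a small, hypothetical sketch (not part of this commit) of setting the reworked sampling and CPU fields directly from code may help; it uses only functions and fields declared in this header, and the concrete values (cores 0-3, seed 1234, temp 0.7f, top_k 50) are arbitrary examples.

```cpp
// Hypothetical sketch: configuring the reworked gpt_params fields in code.
#include "common.h"

int main() {
    gpt_init(); // new: one-time init of logging, prints build info

    gpt_params params;

    // sampling options now live in gpt_sampler_params (params.sparams)
    params.sparams.seed  = 1234;
    params.sparams.temp  = 0.7f;
    params.sparams.top_k = 50;

    // new per-role CPU params: pin the main threadpool to cores 0-3 (assumed valid on this machine)
    if (parse_cpu_range("0-3", params.cpuparams.cpumask)) {
        params.cpuparams.mask_valid = true;
    }
    postprocess_cpu_params(params.cpuparams);

    // a real program would set params.model and then call llama_init_from_gpt_params(params)
    return 0;
}
```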

common/grammar-parser.cpp (deleted)

@ -1,536 +0,0 @@
#include "grammar-parser.h"
#include <cstdint>
#include <cwchar>
#include <string>
#include <utility>
#include <stdexcept>
#include <exception>
namespace grammar_parser {
// NOTE: assumes valid utf8 (but checks for overrun)
// copied from llama.cpp
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t first_byte = static_cast<uint8_t>(*src);
uint8_t highbits = first_byte >> 4;
int len = lookup[highbits];
uint8_t mask = (1 << (8 - len)) - 1;
uint32_t value = first_byte & mask;
const char * end = src + len; // may overrun!
const char * pos = src + 1;
for ( ; pos < end && *pos; pos++) {
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
}
return std::make_pair(value, pos);
}
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
return result.first->second;
}
static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
return next_id;
}
static void add_rule(
parse_state & state,
uint32_t rule_id,
const std::vector<llama_grammar_element> & rule) {
if (state.rules.size() <= rule_id) {
state.rules.resize(rule_id + 1);
}
state.rules[rule_id] = rule;
}
static bool is_digit_char(char c) {
return '0' <= c && c <= '9';
}
static bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
}
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
const char * pos = src;
const char * end = src + size;
uint32_t value = 0;
for ( ; pos < end && *pos; pos++) {
value <<= 4;
char c = *pos;
if ('a' <= c && c <= 'f') {
value += c - 'a' + 10;
} else if ('A' <= c && c <= 'F') {
value += c - 'A' + 10;
} else if ('0' <= c && c <= '9') {
value += c - '0';
} else {
break;
}
}
if (pos != end) {
throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
}
return std::make_pair(value, pos);
}
static const char * parse_space(const char * src, bool newline_ok) {
const char * pos = src;
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
if (*pos == '#') {
while (*pos && *pos != '\r' && *pos != '\n') {
pos++;
}
} else {
pos++;
}
}
return pos;
}
static const char * parse_name(const char * src) {
const char * pos = src;
while (is_word_char(*pos)) {
pos++;
}
if (pos == src) {
throw std::runtime_error(std::string("expecting name at ") + src);
}
return pos;
}
static const char * parse_int(const char * src) {
const char * pos = src;
while (is_digit_char(*pos)) {
pos++;
}
if (pos == src) {
throw std::runtime_error(std::string("expecting integer at ") + src);
}
return pos;
}
static std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
case 'x': return parse_hex(src + 2, 2);
case 'u': return parse_hex(src + 2, 4);
case 'U': return parse_hex(src + 2, 8);
case 't': return std::make_pair('\t', src + 2);
case 'r': return std::make_pair('\r', src + 2);
case 'n': return std::make_pair('\n', src + 2);
case '\\':
case '"':
case '[':
case ']':
return std::make_pair(src[1], src + 2);
default:
throw std::runtime_error(std::string("unknown escape at ") + src);
}
} else if (*src) {
return decode_utf8(src);
}
throw std::runtime_error("unexpected end of input");
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested);
static const char * parse_sequence(
parse_state & state,
const char * src,
const std::string & rule_name,
std::vector<llama_grammar_element> & out_elements,
bool is_nested) {
size_t last_sym_start = out_elements.size();
const char * pos = src;
auto handle_repetitions = [&](int min_times, int max_times) {
if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
}
// apply transformation to previous symbol (last_sym_start to end) according to
// the following rewrite rules:
// S{m,n} --> S S S (m times) S'(n-m)
// S'(x) ::= S S'(x-1) |
// (... n-m definitions of these S' rules ...)
// S'(1) ::= S |
// S{m,} --> S S S (m times) S'
// S' ::= S S' |
// S* --> S{0,}
// --> S' ::= S S' |
// S+ --> S{1,}
// --> S S'
// S' ::= S S' |
// S? --> S{0,1}
// --> S'
// S' ::= S |
std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
if (min_times == 0) {
out_elements.resize(last_sym_start);
} else {
// Repeat the previous elements (min_times - 1) times
for (int i = 1; i < min_times; i++) {
out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
}
}
uint32_t last_rec_rule_id = 0;
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
std::vector<llama_grammar_element> rec_rule(previous_elements);
for (int i = 0; i < n_opt; i++) {
rec_rule.resize(previous_elements.size());
uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
if (i > 0 || max_times < 0) {
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
}
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, rec_rule_id, rec_rule);
last_rec_rule_id = rec_rule_id;
}
if (n_opt > 0) {
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
}
};
while (*pos) {
if (*pos == '"') { // literal string
pos++;
last_sym_start = out_elements.size();
while (*pos != '"') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '[') { // char range(s)
pos++;
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
if (*pos == '^') {
pos++;
start_type = LLAMA_GRETYPE_CHAR_NOT;
}
last_sym_start = out_elements.size();
while (*pos != ']') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
enum llama_gretype type = last_sym_start < out_elements.size()
? LLAMA_GRETYPE_CHAR_ALT
: start_type;
out_elements.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') {
if (!pos[1]) {
throw std::runtime_error("unexpected end of input");
}
auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
}
}
pos = parse_space(pos + 1, is_nested);
} else if (is_word_char(*pos)) { // rule reference
const char * name_end = parse_name(pos);
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
pos = parse_space(name_end, is_nested);
last_sym_start = out_elements.size();
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
} else if (*pos == '(') { // grouping
// parse nested alternates into synthesized rule
pos = parse_space(pos + 1, true);
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
last_sym_start = out_elements.size();
// output reference to synthesized rule
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
if (*pos != ')') {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '.') { // any char
last_sym_start = out_elements.size();
out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, -1);
} else if (*pos == '+') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(1, -1);
} else if (*pos == '?') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, 1);
} else if (*pos == '{') {
pos = parse_space(pos + 1, is_nested);
if (!is_digit_char(*pos)) {
throw std::runtime_error(std::string("expecting an int at ") + pos);
}
const char * int_end = parse_int(pos);
int min_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
int max_times = -1;
if (*pos == '}') {
max_times = min_times;
pos = parse_space(pos + 1, is_nested);
} else if (*pos == ',') {
pos = parse_space(pos + 1, is_nested);
if (is_digit_char(*pos)) {
const char * int_end = parse_int(pos);
max_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
}
if (*pos != '}') {
throw std::runtime_error(std::string("expecting '}' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else {
throw std::runtime_error(std::string("expecting ',' at ") + pos);
}
handle_repetitions(min_times, max_times);
} else {
break;
}
}
return pos;
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested) {
std::vector<llama_grammar_element> rule;
const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
while (*pos == '|') {
rule.push_back({LLAMA_GRETYPE_ALT, 0});
pos = parse_space(pos + 1, true);
pos = parse_sequence(state, pos, rule_name, rule, is_nested);
}
rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, rule_id, rule);
return pos;
}
static const char * parse_rule(parse_state & state, const char * src) {
const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src;
uint32_t rule_id = get_symbol_id(state, src, name_len);
const std::string name(src, name_len);
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
throw std::runtime_error(std::string("expecting ::= at ") + pos);
}
pos = parse_space(pos + 3, true);
pos = parse_alternates(state, pos, name, rule_id, false);
if (*pos == '\r') {
pos += pos[1] == '\n' ? 2 : 1;
} else if (*pos == '\n') {
pos++;
} else if (*pos) {
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
}
return parse_space(pos, true);
}
parse_state parse(const char * src) {
try {
parse_state state;
const char * pos = parse_space(src, true);
while (*pos) {
pos = parse_rule(state, pos);
}
// Validate the state to ensure that all rules are defined
for (const auto & rule : state.rules) {
for (const auto & elem : rule) {
if (elem.type == LLAMA_GRETYPE_RULE_REF) {
// Ensure that the rule at that location exists
if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
// Get the name of the rule that is missing
for (const auto & kv : state.symbol_ids) {
if (kv.second == elem.value) {
throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
}
}
}
}
}
}
return state;
} catch (const std::exception & err) {
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
return parse_state();
}
}
static void print_grammar_char(FILE * file, uint32_t c) {
if (0x20 <= c && c <= 0x7f) {
fprintf(file, "%c", static_cast<char>(c));
} else {
// cop out of encoding UTF-8
fprintf(file, "<U+%04X>", c);
}
}
static bool is_char_element(llama_grammar_element elem) {
switch (elem.type) {
case LLAMA_GRETYPE_CHAR: return true;
case LLAMA_GRETYPE_CHAR_NOT: return true;
case LLAMA_GRETYPE_CHAR_ALT: return true;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
case LLAMA_GRETYPE_CHAR_ANY: return true;
default: return false;
}
}
static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
for (auto elem : rule) {
switch (elem.type) {
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
}
switch (elem.type) {
case LLAMA_GRETYPE_END:
case LLAMA_GRETYPE_ALT:
case LLAMA_GRETYPE_RULE_REF:
fprintf(file, "(%u) ", elem.value);
break;
case LLAMA_GRETYPE_CHAR:
case LLAMA_GRETYPE_CHAR_NOT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_ANY:
fprintf(file, "(\"");
print_grammar_char(file, elem.value);
fprintf(file, "\") ");
break;
}
}
fprintf(file, "\n");
}
static void print_rule(
FILE * file,
uint32_t rule_id,
const std::vector<llama_grammar_element> & rule,
const std::map<uint32_t, std::string> & symbol_id_names) {
if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
throw std::runtime_error(
"malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
}
fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
llama_grammar_element elem = rule[i];
switch (elem.type) {
case LLAMA_GRETYPE_END:
throw std::runtime_error(
"unexpected end of rule: " + std::to_string(rule_id) + "," +
std::to_string(i));
case LLAMA_GRETYPE_ALT:
fprintf(file, "| ");
break;
case LLAMA_GRETYPE_RULE_REF:
fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
break;
case LLAMA_GRETYPE_CHAR:
fprintf(file, "[");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_NOT:
fprintf(file, "[^");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
fprintf(file, "-");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_ALT:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_ANY:
fprintf(file, ".");
break;
}
if (is_char_element(elem)) {
switch (rule[i + 1].type) {
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ANY:
break;
default:
fprintf(file, "] ");
}
}
}
fprintf(file, "\n");
}
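// Example: the rule  root ::= "a" | [b-c]  is stored as the element sequence
//   {CHAR 'a'} {ALT} {CHAR 'b'} {CHAR_RNG_UPPER 'c'} {END}
// and print_rule() renders it back as  root ::= [a] | [b-c]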
void print_grammar(FILE * file, const parse_state & state) {
try {
std::map<uint32_t, std::string> symbol_id_names;
for (const auto & kv : state.symbol_ids) {
symbol_id_names[kv.second] = kv.first;
}
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
// fprintf(file, "%zu: ", i);
// print_rule_binary(file, state.rules[i]);
print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
// fprintf(file, "\n");
}
} catch (const std::exception & err) {
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
}
}
std::vector<const llama_grammar_element *> parse_state::c_rules() {
std::vector<const llama_grammar_element *> ret;
ret.reserve(rules.size());
for (const auto & rule : rules) {
ret.push_back(rule.data());
}
return ret;
}
}


@@ -1,29 +0,0 @@
// Implements a parser for an extended Backus-Naur form (BNF), producing the
// binary context-free grammar format specified by llama.h. Supports character
// ranges, grouping, and repetition operators. As an example, a grammar for
// arithmetic might look like:
//
// root ::= expr
// expr ::= term ([-+*/] term)*
// term ::= num | "(" space expr ")" space
// num ::= [0-9]+ space
// space ::= [ \t\n]*
#pragma once
#include "llama.h"
#include <vector>
#include <map>
#include <cstdint>
#include <string>
namespace grammar_parser {
struct parse_state {
std::map<std::string, uint32_t> symbol_ids;
std::vector<std::vector<llama_grammar_element>> rules;
std::vector<const llama_grammar_element *> c_rules();
};
parse_state parse(const char * src);
void print_grammar(FILE * file, const parse_state & state);
}
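// Illustrative usage of the grammar_parser API declared above (removed by this
// commit); a minimal sketch, assuming the header is included as "grammar-parser.h":
//
//     #include "grammar-parser.h"
//     #include <cstdio>
//
//     int main() {
//         grammar_parser::parse_state st = grammar_parser::parse("root ::= \"yes\" | \"no\"");
//         if (st.rules.empty()) {
//             return 1; // parse errors were already reported on stderr
//         }
//         grammar_parser::print_grammar(stderr, st);
//         return 0;
//     }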

common/log.cpp (new file)
@@ -0,0 +1,401 @@
#include "log.h"
#include <condition_variable>
#include <cstdarg>
#include <cstdio>
#include <mutex>
#include <sstream>
#include <thread>
#include <vector>
int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
void gpt_log_set_verbosity_thold(int verbosity) {
gpt_log_verbosity_thold = verbosity;
}
#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD "\033[1m"
#define LOG_COL_RED "\033[31m"
#define LOG_COL_GREEN "\033[32m"
#define LOG_COL_YELLOW "\033[33m"
#define LOG_COL_BLUE "\033[34m"
#define LOG_COL_MAGENTA "\033[35m"
#define LOG_COL_CYAN "\033[36m"
#define LOG_COL_WHITE "\033[37m"
static int64_t t_us() {
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}
// colors
enum gpt_log_col : int {
GPT_LOG_COL_DEFAULT = 0,
GPT_LOG_COL_BOLD,
GPT_LOG_COL_RED,
GPT_LOG_COL_GREEN,
GPT_LOG_COL_YELLOW,
GPT_LOG_COL_BLUE,
GPT_LOG_COL_MAGENTA,
GPT_LOG_COL_CYAN,
GPT_LOG_COL_WHITE,
};
// disable colors by default
static std::vector<const char *> g_col = {
"",
"",
"",
"",
"",
"",
"",
"",
"",
};
struct gpt_log_entry {
enum ggml_log_level level;
bool prefix;
int64_t timestamp;
std::vector<char> msg;
// signals the worker thread to stop
bool is_end;
void print(FILE * file = nullptr) const {
FILE * fcur = file;
if (!fcur) {
// stderr displays DBG messages only when their verbosity level is not higher than the threshold
// these messages will still be logged to a file
if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
return;
}
fcur = stdout;
if (level != GGML_LOG_LEVEL_NONE) {
fcur = stderr;
}
}
if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
if (timestamp) {
// [M.s.ms.us]
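// e.g. a timestamp of 125678901 us is printed as "2.05.678.901"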
fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
g_col[GPT_LOG_COL_BLUE],
(int) (timestamp / 1000000 / 60),
(int) (timestamp / 1000000 % 60),
(int) (timestamp / 1000 % 1000),
(int) (timestamp % 1000),
g_col[GPT_LOG_COL_DEFAULT]);
}
switch (level) {
case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
default:
break;
}
}
fprintf(fcur, "%s", msg.data());
if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
}
fflush(fcur);
}
};
struct gpt_log {
// default capacity - will be expanded if needed
gpt_log() : gpt_log(256) {}
gpt_log(size_t capacity) {
file = nullptr;
prefix = false;
timestamps = false;
running = false;
t_start = t_us();
// initial message size - will be expanded if longer messages arrive
entries.resize(capacity);
for (auto & entry : entries) {
entry.msg.resize(256);
}
head = 0;
tail = 0;
resume();
}
~gpt_log() {
pause();
if (file) {
fclose(file);
}
}
private:
std::mutex mtx;
std::thread thrd;
std::condition_variable cv;
FILE * file;
bool prefix;
bool timestamps;
bool running;
int64_t t_start;
// ring buffer of entries
std::vector<gpt_log_entry> entries;
size_t head;
size_t tail;
// worker thread copies into this
gpt_log_entry cur;
public:
void add(enum ggml_log_level level, const char * fmt, va_list args) {
std::lock_guard<std::mutex> lock(mtx);
if (!running) {
// discard messages while the worker thread is paused
return;
}
auto & entry = entries[tail];
{
// cannot use args twice, so make a copy in case we need to expand the buffer
va_list args_copy;
va_copy(args_copy, args);
#if 1
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
if (n >= entry.msg.size()) {
entry.msg.resize(n + 1);
vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
}
#else
// hack for bolding arguments
std::stringstream ss;
for (int i = 0; fmt[i] != 0; i++) {
if (fmt[i] == '%') {
ss << LOG_COL_BOLD;
while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
ss << LOG_COL_DEFAULT;
if (fmt[i] == 0) break;
}
ss << fmt[i];
}
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
if (n >= entry.msg.size()) {
entry.msg.resize(n + 1);
vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
}
#endif
}
entry.level = level;
entry.prefix = prefix;
entry.timestamp = 0;
if (timestamps) {
entry.timestamp = t_us() - t_start;
}
entry.is_end = false;
tail = (tail + 1) % entries.size();
if (tail == head) {
// expand the buffer
std::vector<gpt_log_entry> new_entries(2*entries.size());
size_t new_tail = 0;
do {
new_entries[new_tail] = std::move(entries[head]);
head = (head + 1) % entries.size();
new_tail = (new_tail + 1);
} while (head != tail);
head = 0;
tail = new_tail;
for (size_t i = tail; i < new_entries.size(); i++) {
new_entries[i].msg.resize(256);
}
entries = std::move(new_entries);
}
cv.notify_one();
}
void resume() {
std::lock_guard<std::mutex> lock(mtx);
if (running) {
return;
}
running = true;
thrd = std::thread([this]() {
while (true) {
{
std::unique_lock<std::mutex> lock(mtx);
cv.wait(lock, [this]() { return head != tail; });
cur = entries[head];
head = (head + 1) % entries.size();
}
if (cur.is_end) {
break;
}
cur.print(); // stdout and stderr
if (file) {
cur.print(file);
}
}
});
}
void pause() {
{
std::lock_guard<std::mutex> lock(mtx);
if (!running) {
return;
}
running = false;
// push an entry to signal the worker thread to stop
{
auto & entry = entries[tail];
entry.is_end = true;
tail = (tail + 1) % entries.size();
}
cv.notify_one();
}
thrd.join();
}
void set_file(const char * path) {
pause();
if (file) {
fclose(file);
}
if (path) {
file = fopen(path, "w");
} else {
file = nullptr;
}
resume();
}
void set_colors(bool colors) {
pause();
if (colors) {
g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
} else {
for (size_t i = 0; i < g_col.size(); i++) {
g_col[i] = "";
}
}
resume();
}
void set_prefix(bool prefix) {
std::lock_guard<std::mutex> lock(mtx);
this->prefix = prefix;
}
void set_timestamps(bool timestamps) {
std::lock_guard<std::mutex> lock(mtx);
this->timestamps = timestamps;
}
};
//
// public API
//
struct gpt_log * gpt_log_init() {
return new gpt_log;
}
struct gpt_log * gpt_log_main() {
static struct gpt_log log;
return &log;
}
void gpt_log_pause(struct gpt_log * log) {
log->pause();
}
void gpt_log_resume(struct gpt_log * log) {
log->resume();
}
void gpt_log_free(struct gpt_log * log) {
delete log;
}
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
va_list args;
va_start(args, fmt);
log->add(level, fmt, args);
va_end(args);
}
void gpt_log_set_file(struct gpt_log * log, const char * file) {
log->set_file(file);
}
void gpt_log_set_colors(struct gpt_log * log, bool colors) {
log->set_colors(colors);
}
void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
log->set_prefix(prefix);
}
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
log->set_timestamps(timestamps);
}
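// Illustrative usage of the gpt_log API above; a minimal sketch, assuming the
// matching declarations from the updated common/log.h are included:
//
//     gpt_log_set_verbosity_thold(LOG_DEFAULT_DEBUG);       // also show debug messages
//     gpt_log_set_colors    (gpt_log_main(), true);
//     gpt_log_set_prefix    (gpt_log_main(), true);
//     gpt_log_set_timestamps(gpt_log_main(), true);
//     gpt_log_set_file      (gpt_log_main(), "llama.log");  // mirror output to a file
//     gpt_log_add(gpt_log_main(), GGML_LOG_LEVEL_INFO, "loaded %d tensors\n", 42);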


@@ -1,724 +1,92 @@
#pragma once #pragma once
#include <chrono> #include "ggml.h" // for ggml_log_level
#include <cstring>
#include <sstream>
#include <iostream>
#include <thread>
#include <vector>
#include <algorithm>
#include <cinttypes>
// -------------------------------- #ifndef __GNUC__
// # define LOG_ATTRIBUTE_FORMAT(...)
// Basic usage: #elif defined(__MINGW32__)
// # define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
// --------
//
// The LOG() and LOG_TEE() macros are ready to go by default
// they do not require any initialization.
//
// LOGLN() and LOG_TEELN() are variants which automatically
// include \n character at the end of the log string.
//
// LOG() behaves exactly like printf, by default writing to a logfile.
// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
//
// Default logfile is named
// "llama.<threadID>.log"
// Default LOG_TEE() secondary output target is
// stderr
//
// Logs can be dynamically disabled or enabled using functions:
// log_disable()
// and
// log_enable()
//
// A log target can be changed with:
// log_set_target( string )
// creating and opening, or re-opening a file by string filename
// or
// log_set_target( FILE* )
// allowing to point at stderr, stdout, or any valid FILE* file handler.
//
// --------
//
// End of Basic usage.
//
// --------------------------------
// Specifies a log target.
// default uses log_handler() with "llama.log" log file
// this can be changed, by defining LOG_TARGET
// like so:
//
// #define LOG_TARGET (a valid FILE*)
// #include "log.h"
//
// or it can be simply redirected to stdout or stderr
// like so:
//
// #define LOG_TARGET stderr
// #include "log.h"
//
// The log target can also be redirected to a different function
// like so:
//
// #define LOG_TARGET log_handler_different()
// #include "log.h"
//
// FILE* log_handler_different()
// {
// return stderr;
// }
//
// or:
//
// #define LOG_TARGET log_handler_another_one("somelog.log")
// #include "log.h"
//
// FILE* log_handler_another_one(char*filename)
// {
// static FILE* logfile = nullptr;
// (...)
// if( !logfile )
// {
// fopen(...)
// }
// (...)
// return logfile
// }
//
#ifndef LOG_TARGET
#define LOG_TARGET log_handler()
#endif
#ifndef LOG_TEE_TARGET
#define LOG_TEE_TARGET stderr
#endif
// Utility for synchronizing log configuration state
// since std::optional was introduced only in c++17
enum LogTriState
{
LogTriStateSame,
LogTriStateFalse,
LogTriStateTrue
};
// Utility to obtain "pid" like unique process id and use it when creating log files.
inline std::string log_get_pid()
{
static std::string pid;
if (pid.empty())
{
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
// it's not the same as "pid" but is unique enough to solve multiple instances
// trying to write to the same log.
std::stringstream ss;
ss << std::this_thread::get_id();
pid = ss.str();
}
return pid;
}
// Utility function for generating log file names with unique id based on thread id.
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
// where the number is a runtime id of the current thread.
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
// INTERNAL, DO NOT USE
inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
{
static bool _multilog = false;
if (multilog != LogTriStateSame)
{
_multilog = multilog == LogTriStateTrue;
}
std::stringstream buf;
buf << log_file_basename;
if (_multilog)
{
buf << ".";
buf << log_get_pid();
}
buf << ".";
buf << log_file_extension;
return buf.str();
}
#ifndef LOG_DEFAULT_FILE_NAME
#define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
#endif
// Utility for turning #define values into string literals
// so we can have a define for stderr and
// we can print "stderr" instead of literal stderr, etc.
#define LOG_STRINGIZE1(s) #s
#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
// Allows disabling timestamps.
// in order to disable, define LOG_NO_TIMESTAMPS
// like so:
//
// #define LOG_NO_TIMESTAMPS
// #include "log.h"
//
#ifndef LOG_NO_TIMESTAMPS
#ifndef _MSC_VER
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
#else #else
#define LOG_TIMESTAMP_FMT "%s" # define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#define LOG_TIMESTAMP_VAL ,""
#endif #endif
#ifdef LOG_TEE_TIMESTAMPS #define LOG_DEFAULT_DEBUG 1
#ifndef _MSC_VER #define LOG_DEFAULT_LLAMA 0
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
#else
#define LOG_TEE_TIMESTAMP_FMT "%s"
#define LOG_TEE_TIMESTAMP_VAL ,""
#endif
// Allows disabling file/line/function prefix // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
// in order to disable, define LOG_NO_FILE_LINE_FUNCTION // set via gpt_log_set_verbosity()
// like so: extern int gpt_log_verbosity_thold;
void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
// the gpt_log uses an internal worker thread to print/write log messages
// when the worker thread is paused, incoming log messages are discarded
struct gpt_log;
struct gpt_log * gpt_log_init();
struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
void gpt_log_free (struct gpt_log * log);
LOG_ATTRIBUTE_FORMAT(3, 4)
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
// defaults: file = NULL, colors = false, prefix = false, timestamps = false
// //
// #define LOG_NO_FILE_LINE_FUNCTION // regular log output:
// #include "log.h"
// //
#ifndef LOG_NO_FILE_LINE_FUNCTION // ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
#ifndef _MSC_VER // llm_load_tensors: ggml ctx size = 0.27 MiB
#define LOG_FLF_FMT "[%24s:%5d][%24s] " // llm_load_tensors: offloading 32 repeating layers to GPU
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ // llm_load_tensors: offloading non-repeating layers to GPU
#else
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif
#else
#define LOG_FLF_FMT "%s"
#define LOG_FLF_VAL ,""
#endif
#ifdef LOG_TEE_FILE_LINE_FUNCTION
#ifndef _MSC_VER
#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif
#else
#define LOG_TEE_FLF_FMT "%s"
#define LOG_TEE_FLF_VAL ,""
#endif
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
// //
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) // with prefix = true, timestamps = true, the log output will look like this:
#define LOG_IMPL(str, ...) \ //
do { \ // 0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
if (LOG_TARGET != nullptr) \ // 0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB
{ \ // 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ // 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
fflush(LOG_TARGET); \ //
} \ // I - info (stdout, V = 0)
// W - warning (stderr, V = 0)
// E - error (stderr, V = 0)
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
//
void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe
void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe
void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix
// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
//
// for example:
//
// LOG_DBG("this is a debug message: %d\n", expensive_function());
//
// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
//
#define LOG_TMPL(level, verbosity, ...) \
do { \
if ((verbosity) <= gpt_log_verbosity_thold) { \
gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
} \
} while (0) } while (0)
#else
#define LOG_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
} while (0)
#endif
// INTERNAL, DO NOT USE #define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
// USE LOG_TEE() INSTEAD #define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
//
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
#define LOG_TEE_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
} while (0)
#else
#define LOG_TEE_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
} while (0)
#endif
// The '\0' as a last argument, is a trick to bypass the silly #define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__)
// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro" #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
// so we can have a single macro which can be called just like printf. #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__)
// Main LOG macro. #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
// behaves like printf, and supports arguments the exact same way. #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
// #define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
#if !defined(_MSC_VER) || defined(__clang__) #define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
#define LOG(...) LOG_IMPL(__VA_ARGS__, "") #define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT, verbosity, __VA_ARGS__)
#else
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif
// Main TEE macro.
// does the same as LOG
// and
// simultaneously writes stderr.
//
// Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET
//
#if !defined(_MSC_VER) || defined(__clang__)
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
#if !defined(_MSC_VER) || defined(__clang__)
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
#endif
// INTERNAL, DO NOT USE
inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
{
static bool _initialized = false;
static bool _append = false;
static bool _disabled = filename.empty() && target == nullptr;
static std::string log_current_filename{filename};
static FILE *log_current_target{target};
static FILE *logfile = nullptr;
if (change)
{
if (append != LogTriStateSame)
{
_append = append == LogTriStateTrue;
return logfile;
}
if (disable == LogTriStateTrue)
{
// Disable primary target
_disabled = true;
}
// If previously disabled, only enable, and keep previous target
else if (disable == LogTriStateFalse)
{
_disabled = false;
}
// Otherwise, process the arguments
else if (log_current_filename != filename || log_current_target != target)
{
_initialized = false;
}
}
if (_disabled)
{
// Log is disabled
return nullptr;
}
if (_initialized)
{
// with fallback in case something went wrong
return logfile ? logfile : stderr;
}
// do the (re)initialization
if (target != nullptr)
{
if (logfile != nullptr && logfile != stdout && logfile != stderr)
{
fclose(logfile);
}
log_current_filename = LOG_DEFAULT_FILE_NAME;
log_current_target = target;
logfile = target;
}
else
{
if (log_current_filename != filename)
{
if (logfile != nullptr && logfile != stdout && logfile != stderr)
{
fclose(logfile);
}
}
logfile = fopen(filename.c_str(), _append ? "a" : "w");
}
if (!logfile)
{
// Verify whether the file was opened, otherwise fallback to stderr
logfile = stderr;
fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
fflush(stderr);
// At this point we let the init flag be set to true below, and let the target fall back to stderr
// otherwise we would repeatedly fopen() which was already unsuccessful
}
_initialized = true;
return logfile ? logfile : stderr;
}
// INTERNAL, DO NOT USE
inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
{
return log_handler1_impl(change, append, disable, filename, target);
}
// Disables logs entirely at runtime.
// Makes LOG() and LOG_TEE() produce no output,
// until enabled back.
#define log_disable() log_disable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_disable_impl()
{
return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
}
// Enables logs at runtime.
#define log_enable() log_enable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_enable_impl()
{
return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
}
// Sets target for logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
#define log_set_target(target) log_set_target_impl(target)
// INTERNAL, DO NOT USE
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
// INTERNAL, DO NOT USE
inline FILE *log_handler() { return log_handler1_impl(); }
// Enable or disable creating separate log files for each run.
// can ONLY be invoked BEFORE first log use.
#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
// Enable or disable append mode for log file.
// can ONLY be invoked BEFORE first log use.
#define log_append(enable) log_append_impl(enable)
// INTERNAL, DO NOT USE
inline FILE *log_append_impl(bool enable)
{
return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
}
inline void log_test()
{
log_disable();
LOG("01 Hello World to nobody, because logs are disabled!\n");
log_enable();
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
log_set_target(stderr);
LOG("04 Hello World to stderr!\n");
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("06 Hello World to default log file!\n");
log_set_target(stdout);
LOG("07 Hello World to stdout!\n");
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("08 Hello World to default log file again!\n");
log_disable();
LOG("09 Hello World _1_ into the void!\n");
log_enable();
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
log_disable();
log_set_target("llama.anotherlog.log");
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
log_enable();
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
log_set_target("llama.yetanotherlog.log");
LOG("13 Hello World this time in yet new file?\n");
log_set_target(log_filename_generator("llama_autonamed", "log"));
LOG("14 Hello World in log with generated filename!\n");
#ifdef _MSC_VER
LOG_TEE("15 Hello msvc TEE without arguments\n");
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
LOG_TEELN("17 Hello msvc TEELN without arguments\n");
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
LOG("19 Hello msvc LOG without arguments\n");
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
LOGLN("21 Hello msvc LOGLN without arguments\n");
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
#endif
}
inline bool log_param_single_parse(const std::string & param)
{
if ( param == "--log-test")
{
log_test();
return true;
}
if ( param == "--log-disable")
{
log_disable();
return true;
}
if ( param == "--log-enable")
{
log_enable();
return true;
}
if (param == "--log-new")
{
log_multilog(true);
return true;
}
if (param == "--log-append")
{
log_append(true);
return true;
}
return false;
}
inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
{
if ( param == "--log-file")
{
if (!check_but_dont_parse)
{
log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
}
return true;
}
return false;
}
inline void log_print_usage()
{
printf("log options:\n");
/* format
printf(" -h, --help show this help message and exit\n");*/
/* spacing
printf("__-param----------------Description\n");*/
printf(" --log-test Run simple logging test\n");
printf(" --log-disable Disable trace logs\n");
printf(" --log-enable Enable trace logs\n");
printf(" --log-file Specify a log filename (without extension)\n");
printf(" --log-new Create a separate new log file on start. "
"Each log file will have unique name: \"<name>.<ID>.log\"\n");
printf(" --log-append Don't truncate the old log file.\n");
printf("\n");
}
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
// INTERNAL, DO NOT USE
inline void log_dump_cmdline_impl(int argc, char **argv)
{
std::stringstream buf;
for (int i = 0; i < argc; ++i)
{
if (std::string(argv[i]).find(' ') != std::string::npos)
{
buf << " \"" << argv[i] <<"\"";
}
else
{
buf << " " << argv[i];
}
}
LOGLN("Cmd:%s", buf.str().c_str());
}
#define log_tostr(var) log_var_to_string_impl(var).c_str()
inline std::string log_var_to_string_impl(bool var)
{
return var ? "true" : "false";
}
inline std::string log_var_to_string_impl(std::string var)
{
return var;
}
inline std::string log_var_to_string_impl(const std::vector<int> & var)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (auto e : var)
{
if (first)
{
first = false;
}
else
{
buf << ", ";
}
buf << std::to_string(e);
}
buf << " ]";
return buf.str();
}
template <typename C, typename T>
inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (const auto & token : tokens)
{
if (!first) {
buf << ", ";
} else {
first = false;
}
auto detokenized = llama_token_to_piece(ctx, token);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf
<< "'" << detokenized << "'"
<< ":" << std::to_string(token);
}
buf << " ]";
return buf.str();
}
template <typename C, typename B>
inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (int i = 0; i < batch.n_tokens; ++i)
{
if (!first) {
buf << ", ";
} else {
first = false;
}
auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf
<< "\n" << std::to_string(i)
<< ":token '" << detokenized << "'"
<< ":pos " << std::to_string(batch.pos[i])
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
<< ":logits " << std::to_string(batch.logits[i]);
}
buf << " ]";
return buf.str();
}
#ifdef LOG_DISABLE_LOGS
#undef LOG
#define LOG(...) // dummy stub
#undef LOGLN
#define LOGLN(...) // dummy stub
#undef LOG_TEE
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
#undef LOG_TEELN
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
#undef LOG_DISABLE
#define LOG_DISABLE() // dummy stub
#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub
#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub
#undef LOG_SET_TARGET
#define LOG_SET_TARGET(...) // dummy stub
#undef LOG_DUMP_CMDLINE
#define LOG_DUMP_CMDLINE(...) // dummy stub
#endif // LOG_DISABLE_LOGS


@@ -2,8 +2,11 @@
#include "common.h" #include "common.h"
#include "log.h" #include "log.h"
#include <cinttypes>
#include <cstdint> #include <cstdint>
#include <cstdio>
#include <fstream> #include <fstream>
#include <thread>
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) { std::vector<llama_token> & inp, int nnew, bool print_progress) {


@@ -1,460 +1,458 @@
#define LLAMA_API_INTERNAL
#include "sampling.h" #include "sampling.h"
#include <random>
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) { #include "common.h"
struct llama_sampling_context * result = new llama_sampling_context();
result->params = params; #include <cmath>
result->grammar = nullptr; #include <unordered_map>
// if there is a grammar, parse it // the ring buffer works similarly to std::deque, but with a fixed capacity
if (!params.grammar.empty()) { // TODO: deduplicate with llama-impl.h
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str()); template<typename T>
struct ring_buffer {
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
// will be empty (default) if there are parse errors T & front() {
if (result->parsed_grammar.rules.empty()) { if (sz == 0) {
fprintf(stderr, "%s: failed to parse grammar\n", __func__); throw std::runtime_error("ring buffer is empty");
delete result; }
return nullptr; return data[first];
}
const T & front() const {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[first];
}
T & back() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
// the most recently pushed element sits just before the write position
return data[(pos + capacity - 1) % capacity];
}
const T & back() const {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[(pos + capacity - 1) % capacity];
}
void push_back(const T & value) {
if (sz == capacity) {
// advance the start when buffer is full
first = (first + 1) % capacity;
} else {
sz++;
}
data[pos] = value;
pos = (pos + 1) % capacity;
}
T pop_front() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
T value = data[first];
first = (first + 1) % capacity;
sz--;
return value;
}
const T & rat(size_t i) const {
if (i >= sz) {
throw std::runtime_error("ring buffer: index out of bounds");
}
return data[(first + sz - i - 1) % capacity];
}
std::vector<T> to_vector() const {
std::vector<T> result;
result.reserve(sz);
for (size_t i = 0; i < sz; i++) {
result.push_back(data[(first + i) % capacity]);
}
return result;
}
void clear() {
// only reset the buffer state here - the underlying storage is kept
sz = 0;
first = 0;
pos = 0;
}
bool empty() const {
return sz == 0;
}
size_t size() const {
return sz;
}
size_t capacity = 0;
size_t sz = 0;
size_t first = 0;
size_t pos = 0;
std::vector<T> data;
};
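// Illustrative behavior of the fixed-capacity ring_buffer above (not part of
// this file): once full, push_back() evicts the oldest element, rat(0) is the
// most recently pushed one, and front() the oldest one kept.
//
//     ring_buffer<int> rb(3);
//     rb.push_back(1);
//     rb.push_back(2);
//     rb.push_back(3);
//     rb.push_back(4);                       // capacity reached: 1 is evicted
//     // rb.front() == 2, rb.rat(0) == 4, rb.to_vector() == {2, 3, 4}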
struct gpt_sampler {
gpt_sampler_params params;
struct llama_sampler * grmr;
struct llama_sampler * chain;
ring_buffer<llama_token> prev;
std::vector<llama_token_data> cur;
llama_token_data_array cur_p;
void set_logits(struct llama_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
} }
// Ensure that there is a "root" node. cur_p = { cur.data(), cur.size(), -1, false };
if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
delete result;
return nullptr;
}
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
struct llama_grammar * grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar");
}
result->grammar = grammar;
} }
};
result->prev.resize(params.n_prev); std::string gpt_sampler_params::print() const {
result->n_valid = 0;
llama_sampling_set_rng_seed(result, params.seed);
return result;
}
void llama_sampling_free(struct llama_sampling_context * ctx) {
if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar);
}
delete ctx;
}
void llama_sampling_reset(llama_sampling_context * ctx) {
if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar);
ctx->grammar = NULL;
}
if (!ctx->parsed_grammar.rules.empty()) {
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
struct llama_grammar * grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar");
}
ctx->grammar = grammar;
}
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
ctx->cur.clear();
ctx->n_valid = 0;
}
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
if (seed == LLAMA_DEFAULT_SEED) {
seed = std::random_device{}();
}
ctx->rng.seed(seed);
}
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
if (dst->grammar) {
llama_grammar_free(dst->grammar);
dst->grammar = nullptr;
}
if (src->grammar) {
dst->grammar = llama_grammar_copy(src->grammar);
}
dst->prev = src->prev;
}
llama_token llama_sampling_last(llama_sampling_context * ctx) {
return ctx->prev.back();
}
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
const int size = ctx_sampling->prev.size();
n = std::min(n, size);
std::string result;
for (int i = size - n; i < size; i++) {
result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
}
return result;
}
std::string llama_sampling_print(const llama_sampling_params & params) {
char result[1024]; char result[1024];
snprintf(result, sizeof(result), snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present, penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp, top_k, tfs_z, top_p, min_p, typ_p, temp,
params.mirostat, params.mirostat_eta, params.mirostat_tau); mirostat, mirostat_eta, mirostat_tau);
return std::string(result); return std::string(result);
} }
std::string llama_sampling_order_print(const llama_sampling_params & params) { struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
std::string result = "CFG -> Penalties "; llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
if (params.mirostat == 0) {
for (auto sampler_type : params.samplers_sequence) { lparams.no_perf = params.no_perf;
const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
if (!sampler_type_name.empty()) { auto * result = new gpt_sampler {
result += "-> " + sampler_type_name + " "; /* .params = */ params,
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
/* .chain = */ llama_sampler_chain_init(lparams),
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
/* .cur_p = */ {},
};
llama_sampler_chain_add(result->chain,
llama_sampler_init_logit_bias(
llama_n_vocab(model),
params.logit_bias.size(),
params.logit_bias.data()));
llama_sampler_chain_add(result->chain,
llama_sampler_init_penalties(
llama_n_vocab (model),
llama_token_eos(model),
llama_token_nl (model),
params.penalty_last_n,
params.penalty_repeat,
params.penalty_freq,
params.penalty_present,
params.penalize_nl,
params.ignore_eos));
if (params.temp > 0.0f) {
if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) {
switch (cnstr) {
case GPT_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break;
case GPT_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
break;
case GPT_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break;
case GPT_SAMPLER_TYPE_TFS_Z:
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
break;
case GPT_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
case GPT_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
default:
GGML_ASSERT(false && "unknown sampler type");
}
} }
llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
} else {
GGML_ASSERT(false && "unknown mirostat version");
} }
} else { } else {
result += "-> mirostat "; if (params.n_probs > 0) {
// some use cases require sampling greedily, while still obtaining the probabilities of the top tokens
// ref: https://github.com/ggerganov/llama.cpp/pull/9605
//
// the following will not produce exactly the same probs as applying softmax to the full vocabulary, but
// it is much faster, since we avoid sorting all tokens and should give a good approximation
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
}
llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
} }
return result; return result;
} }
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) { void gpt_sampler_free(struct gpt_sampler * gsmpl) {
switch (sampler_type) { if (gsmpl) {
case llama_sampler_type::TOP_K: return "top_k"; llama_sampler_free(gsmpl->grmr);
case llama_sampler_type::TFS_Z: return "tfs_z";
case llama_sampler_type::TYPICAL_P: return "typical_p"; llama_sampler_free(gsmpl->chain);
case llama_sampler_type::TOP_P: return "top_p";
case llama_sampler_type::MIN_P: return "min_p"; delete gsmpl;
case llama_sampler_type::TEMPERATURE: return "temperature"; }
}
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
if (accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
}
llama_sampler_accept(gsmpl->chain, token);
gsmpl->prev.push_back(token);
}
void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
llama_sampler_reset(gsmpl->grmr);
llama_sampler_reset(gsmpl->chain);
}
struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
return new gpt_sampler {
/* .params = */ gsmpl->params,
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
/* .chain = */ llama_sampler_clone(gsmpl->chain),
/* .prev = */ gsmpl->prev,
/* .cur = */ gsmpl->cur,
/* .cur_p = */ gsmpl->cur_p,
};
}
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
// TODO: measure grammar performance
if (gsmpl) {
llama_perf_sampler_print(gsmpl->chain);
}
if (ctx) {
llama_perf_context_print(ctx);
}
}
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
gsmpl->set_logits(ctx, idx);
auto & grmr = gsmpl->grmr;
auto & chain = gsmpl->chain;
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
if (grammar_first) {
llama_sampler_apply(grmr, &cur_p);
}
llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
const llama_token id = cur_p.data[cur_p.selected].id;
if (grammar_first) {
return id;
}
// check if the sampled token fits the grammar
{
llama_token_data single_token_data = { id, 1.0f, 0.0f };
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
llama_sampler_apply(grmr, &single_token_data_array);
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
if (is_valid) {
return id;
}
}
// resampling:
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
gsmpl->set_logits(ctx, idx);
llama_sampler_apply(grmr, &cur_p);
llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
return cur_p.data[cur_p.selected].id;
}
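// Illustrative decode-loop usage of the helpers above; a minimal sketch where
// model, ctx, sparams, n_predict and the decode step are assumed to come from
// the caller:
//
//     gpt_sampler * smpl = gpt_sampler_init(model, sparams);
//     for (int i = 0; i < n_predict; i++) {
//         const llama_token id = gpt_sampler_sample(smpl, ctx, /* idx = */ -1, /* grammar_first = */ false);
//         gpt_sampler_accept(smpl, id, /* accept_grammar = */ true);
//         // ... append id to the batch and call llama_decode() ...
//     }
//     gpt_sampler_free(smpl);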
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
return llama_sampler_get_seed(gsmpl->chain);
}
// helpers
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
return &gsmpl->cur_p;
}
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
return gsmpl->prev.rat(0);
}
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
std::string result = "logits ";
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
result += std::string("-> ") + llama_sampler_name(smpl) + " ";
}
return result;
}
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
n = std::min(n, (int) gsmpl->prev.size());
if (n <= 0) {
return "";
}
std::string result;
result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
for (int i = n - 1; i >= 0; i--) {
const llama_token id = gsmpl->prev.rat(i);
GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
result += llama_token_to_piece(ctx_main, id);
}
return result;
}
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
switch (cnstr) {
case GPT_SAMPLER_TYPE_TOP_K: return 'k';
case GPT_SAMPLER_TYPE_TFS_Z: return 'f';
case GPT_SAMPLER_TYPE_TYPICAL_P: return 'y';
case GPT_SAMPLER_TYPE_TOP_P: return 'p';
case GPT_SAMPLER_TYPE_MIN_P: return 'm';
case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
default : return '?';
}
}
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
switch (cnstr) {
case GPT_SAMPLER_TYPE_TOP_K: return "top_k";
case GPT_SAMPLER_TYPE_TFS_Z: return "tfs_z";
case GPT_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
case GPT_SAMPLER_TYPE_TOP_P: return "top_p";
case GPT_SAMPLER_TYPE_MIN_P: return "min_p";
case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
default : return ""; default : return "";
} }
} }
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) { std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map { std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
{"top_k", llama_sampler_type::TOP_K}, { "top_k", GPT_SAMPLER_TYPE_TOP_K },
{"top_p", llama_sampler_type::TOP_P}, { "top_p", GPT_SAMPLER_TYPE_TOP_P },
{"typical_p", llama_sampler_type::TYPICAL_P}, { "typ_p", GPT_SAMPLER_TYPE_TYPICAL_P },
{"min_p", llama_sampler_type::MIN_P}, { "min_p", GPT_SAMPLER_TYPE_MIN_P },
{"tfs_z", llama_sampler_type::TFS_Z}, { "tfs_z", GPT_SAMPLER_TYPE_TFS_Z },
{"temperature", llama_sampler_type::TEMPERATURE} { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
}; };
// since samplers names are written multiple ways // since samplers names are written multiple ways
// make it ready for both system names and input names // make it ready for both system names and input names
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map { std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
{"top-k", llama_sampler_type::TOP_K}, { "top-k", GPT_SAMPLER_TYPE_TOP_K },
{"top-p", llama_sampler_type::TOP_P}, { "top-p", GPT_SAMPLER_TYPE_TOP_P },
{"nucleus", llama_sampler_type::TOP_P}, { "nucleus", GPT_SAMPLER_TYPE_TOP_P },
{"typical-p", llama_sampler_type::TYPICAL_P}, { "typical-p", GPT_SAMPLER_TYPE_TYPICAL_P },
{"typical", llama_sampler_type::TYPICAL_P}, { "typical", GPT_SAMPLER_TYPE_TYPICAL_P },
{"min-p", llama_sampler_type::MIN_P}, { "typ-p", GPT_SAMPLER_TYPE_TYPICAL_P },
{"tfs-z", llama_sampler_type::TFS_Z}, { "typ", GPT_SAMPLER_TYPE_TYPICAL_P },
{"tfs", llama_sampler_type::TFS_Z}, { "min-p", GPT_SAMPLER_TYPE_MIN_P },
{"temp", llama_sampler_type::TEMPERATURE} { "tfs-z", GPT_SAMPLER_TYPE_TFS_Z },
{ "tfs", GPT_SAMPLER_TYPE_TFS_Z },
{ "temp", GPT_SAMPLER_TYPE_TEMPERATURE },
}; };
std::vector<llama_sampler_type> sampler_types; std::vector<gpt_sampler_type> samplers;
sampler_types.reserve(names.size()); samplers.reserve(names.size());
for (const auto & name : names)
{
auto sampler_item = sampler_canonical_name_map.find(name);
if (sampler_item != sampler_canonical_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
else
{
if (allow_alt_names)
{
sampler_item = sampler_alt_name_map.find(name);
if (sampler_item != sampler_alt_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
}
}
}
return sampler_types;
}
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) { for (const auto & name : names) {
std::unordered_map<char, llama_sampler_type> sampler_name_map { auto sampler = sampler_canonical_name_map.find(name);
{'k', llama_sampler_type::TOP_K}, if (sampler != sampler_canonical_name_map.end()) {
{'p', llama_sampler_type::TOP_P}, samplers.push_back(sampler->second);
{'y', llama_sampler_type::TYPICAL_P},
{'m', llama_sampler_type::MIN_P},
{'f', llama_sampler_type::TFS_Z},
{'t', llama_sampler_type::TEMPERATURE}
};
std::vector<llama_sampler_type> sampler_types;
sampler_types.reserve(names_string.size());
for (const auto & c : names_string) {
const auto sampler_item = sampler_name_map.find(c);
if (sampler_item != sampler_name_map.end()) {
sampler_types.push_back(sampler_item->second);
}
}
return sampler_types;
}
// no reasons to expose this function in header
static void sampler_queue(
struct llama_context * ctx_main,
const llama_sampling_params & params,
llama_token_data_array & cur_p,
size_t min_keep) {
const float temp = params.temp;
const float dynatemp_range = params.dynatemp_range;
const float dynatemp_exponent = params.dynatemp_exponent;
const int32_t top_k = params.top_k;
const float top_p = params.top_p;
const float min_p = params.min_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
for (auto sampler_type : samplers_sequence) {
switch (sampler_type) {
case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
case llama_sampler_type::TEMPERATURE:
if (dynatemp_range > 0) {
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
} else {
llama_sample_temp(ctx_main, &cur_p, temp);
}
break;
default : break;
}
}
}
static llama_token llama_sampling_sample_impl(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx,
bool is_resampling) {
const llama_sampling_params & params = ctx_sampling->params;
const float temp = params.temp;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
std::vector<float> original_logits;
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
if (ctx_sampling->grammar != NULL && !is_resampling) {
GGML_ASSERT(!original_logits.empty());
}
llama_token id = 0;
if (temp < 0.0) {
// greedy sampling, with probs
llama_sample_softmax(ctx_main, &cur_p);
id = cur_p.data[0].id;
} else if (temp == 0.0) {
// greedy sampling, no probs
id = llama_sample_token_greedy(ctx_main, &cur_p);
} else {
if (mirostat == 1) {
const int mirostat_m = 100;
llama_sample_temp(ctx_main, &cur_p, temp);
id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
} else if (mirostat == 2) {
llama_sample_temp(ctx_main, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
} else { } else {
// temperature sampling if (allow_alt_names) {
size_t min_keep = std::max(1, params.min_keep); sampler = sampler_alt_name_map.find(name);
if (sampler != sampler_alt_name_map.end()) {
sampler_queue(ctx_main, params, cur_p, min_keep); samplers.push_back(sampler->second);
id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
//{
// const int n_top = 10;
// LOG("top %d candidates:\n", n_top);
// for (int i = 0; i < n_top; i++) {
// const llama_token id = cur_p.data[i].id;
// (void)id; // To avoid a warning that id is unused when logging is disabled.
// LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
// }
//}
//LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
}
}
if (ctx_sampling->grammar != NULL && !is_resampling) {
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
// Create an array with a single token data element for the sampled id
llama_token_data single_token_data = {id, logits[id], 0.0f};
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
// Apply grammar constraints to the single token
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
// If the token is not valid according to the grammar, perform resampling
if (!is_valid) {
LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
// Restore logits from the copy
std::copy(original_logits.begin(), original_logits.end(), logits);
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
}
}
ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
return id;
}
static llama_token_data_array llama_sampling_prepare_impl(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx,
bool apply_grammar,
std::vector<float> * original_logits) {
const llama_sampling_params & params = ctx_sampling->params;
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;
const bool penalize_nl = params.penalize_nl;
auto & prev = ctx_sampling->prev;
auto & cur = ctx_sampling->cur;
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
if (ctx_sampling->grammar != NULL && !apply_grammar) {
GGML_ASSERT(original_logits != NULL);
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
*original_logits = {logits, logits + n_vocab};
}
// apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
if (ctx_cfg) {
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}
cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
// apply penalties
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
if (penalty_tokens_used_size) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
llama_sample_repetition_penalties(ctx_main, &cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
cur_p.data[idx].logit = nl_logit;
break;
} }
} }
} }
} }
// apply grammar checks before sampling logic return samplers;
if (apply_grammar && ctx_sampling->grammar != NULL) { }
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
};
std::vector<gpt_sampler_type> samplers;
samplers.reserve(chars.size());
for (const auto & c : chars) {
const auto sampler = sampler_name_map.find(c);
if (sampler != sampler_name_map.end()) {
samplers.push_back(sampler->second);
}
} }
return cur_p; return samplers;
}
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
// Call the implementation function with is_resampling set to false by default
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
}
llama_token_data_array llama_sampling_prepare(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx,
bool apply_grammar,
std::vector<float> * original_logits) {
return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
}
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
llama_token id,
bool apply_grammar) {
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
ctx_sampling->prev.push_back(id);
if (ctx_sampling->grammar != NULL && apply_grammar) {
llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
}
} }
View file
@ -2,159 +2,82 @@
#include "llama.h" #include "llama.h"
#include "grammar-parser.h"
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
// sampler types
enum class llama_sampler_type : char {
TOP_K = 'k',
TOP_P = 'p',
MIN_P = 'm',
TFS_Z = 'f',
TYPICAL_P = 'y',
TEMPERATURE = 't'
};
// sampling parameters
typedef struct llama_sampling_params {
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
std::vector<llama_sampler_type> samplers_sequence = {
llama_sampler_type::TOP_K,
llama_sampler_type::TFS_Z,
llama_sampler_type::TYPICAL_P,
llama_sampler_type::TOP_P,
llama_sampler_type::MIN_P,
llama_sampler_type::TEMPERATURE
};
std::string grammar; // optional BNF-like grammar to constrain sampling
// Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // how strong is guidance
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
std::vector<llama_token> penalty_prompt_tokens;
bool use_penalty_prompt_tokens = false;
} llama_sampling_params;
// general sampler context
// TODO: move to llama.h
struct llama_sampling_context {
// parameters that will be used for sampling
llama_sampling_params params;
// mirostat sampler state
float mirostat_mu;
llama_grammar * grammar;
// internal
grammar_parser::parse_state parsed_grammar;
// TODO: replace with ring-buffer
std::vector<llama_token> prev;
std::vector<llama_token_data> cur;
size_t n_valid; // Number of correct top tokens with correct probabilities.
std::mt19937 rng;
};
#include "common.h" #include "common.h"
// Create a new sampling context instance. #include <string>
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params); #include <vector>
void llama_sampling_free(struct llama_sampling_context * ctx); // gpt_sampler extends llama_sampler with additional functionality:
// Reset the sampler context
// - clear prev tokens
// - reset grammar
void llama_sampling_reset(llama_sampling_context * ctx);
// Set the sampler seed
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
// Copy the sampler context
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
// Get the last sampled token
llama_token llama_sampling_last(llama_sampling_context * ctx);
// Get a string representation of the last sampled tokens
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
// Print sampling parameters into a string
std::string llama_sampling_print(const llama_sampling_params & params);
// Print sampling order into a string
std::string llama_sampling_order_print(const llama_sampling_params & params);
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call
// llama_sampling_reset when a sequence ends
// //
// required: // - grammar support
// - ctx_main: context to use for sampling // - custom sampler logic based on the parameters
// - ctx_sampling: sampling-specific context // - history of the last accepted tokens
// - performance metrics
// //
// optional: // This goal is to have a common implementation of the sampling logic shared across the examples.
// - ctx_cfg: context to use for classifier-free guidance // For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
// - idx: sample from llama_get_logits_ith(ctx, idx) // complex (top-k, top-p, etc).
// //
// returns: // Another example is related to the grammar. In general, the grammar constraints applied on the full
// - token: sampled token // vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
// - candidates: vector of candidate tokens // token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
// grammar constraints are applied to the full vocabulary and the token is resampled.
//
// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
// be moved into the core llama library.
//
// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens.
//
// TODO: measure grammar performance
// //
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = -1);
// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters. struct gpt_sampler;
llama_token_data_array llama_sampling_prepare(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = 0,
bool apply_grammar = true,
std::vector<float> * original_logits = nullptr);
void llama_sampling_accept( // llama_sampler API overloads
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main, struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
llama_token id,
bool apply_grammar); void gpt_sampler_free(struct gpt_sampler * gsmpl);
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
void gpt_sampler_reset (struct gpt_sampler * gsmpl);
struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
// arguments can be nullptr to skip printing
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
// extended sampling implementation:
//
// - set logits
// - apply the configured sampler chain
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
// helpers
// access the internal list of current candidate tokens
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
// get the last accepted token
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
// print the sampler chain into a string
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
// get a string representation of the last accepted tokens
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
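For orientation, below is a small usage sketch of the new gpt_sampler API declared above. It is not part of this commit: the helper name, the header paths, the contents of `gpt_sampler_params` and the meaning of `idx = -1` are assumptions, and the llama_decode step is elided.

```cpp
// Hypothetical sketch only -- not code from this commit.
#include "common.h"    // assumed to provide gpt_sampler_params
#include "sampling.h"  // the header shown above (path assumed)

#include <cstdio>

static void generate_n(struct llama_model * model, struct llama_context * ctx,
                       const struct gpt_sampler_params & sparams, int n_tokens) {
    // build the sampler chain (grammar + configured samplers) from the params
    struct gpt_sampler * smpl = gpt_sampler_init(model, sparams);

    printf("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());

    for (int i = 0; i < n_tokens; ++i) {
        // sample from the logits at output index -1 (assumed to select the last output);
        // grammar_first is left at false, so the grammar only verifies the sampled token
        // and the slower resampling path runs only when that token does not fit
        const llama_token id = gpt_sampler_sample(smpl, ctx, /* idx */ -1);

        // record the token in the sampler history and advance the grammar state
        gpt_sampler_accept(smpl, id, /* accept_grammar */ true);

        // ... evaluate `id` with llama_decode() here before sampling the next token ...
    }

    gpt_sampler_free(smpl);
}
```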
File diff suppressed because it is too large
View file
@ -1,9 +1,11 @@
#include "train.h" #include "train.h"
#include "common.h" #include "common.h"
#include <algorithm>
#include <random> #include <random>
#include <sstream> #include <sstream>
#include <functional> #include <functional>
#include <cstring>
struct random_normal_distribution { struct random_normal_distribution {
std::mt19937 gen; std::mt19937 gen;
View file
@ -3,6 +3,7 @@
from __future__ import annotations from __future__ import annotations
import ast
import logging import logging
import argparse import argparse
import contextlib import contextlib
@ -63,6 +64,7 @@ class Model:
model_name: str | None model_name: str | None
metadata_override: Path | None metadata_override: Path | None
dir_model_card: Path dir_model_card: Path
is_lora: bool
# subclasses should define this! # subclasses should define this!
model_arch: gguf.MODEL_ARCH model_arch: gguf.MODEL_ARCH
@ -70,7 +72,7 @@ class Model:
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
use_temp_file: bool = False, eager: bool = False, use_temp_file: bool = False, eager: bool = False,
metadata_override: Path | None = None, model_name: str | None = None, metadata_override: Path | None = None, model_name: str | None = None,
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False):
if type(self) is Model: if type(self) is Model:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
@ -92,6 +94,7 @@ class Model:
self.metadata_override = metadata_override self.metadata_override = metadata_override
self.model_name = model_name self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
self.is_lora = is_lora # true if model is used inside convert_lora_to_gguf.py
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
if self.ftype == gguf.LlamaFileType.GUESSED: if self.ftype == gguf.LlamaFileType.GUESSED:
@ -129,12 +132,14 @@ class Model:
def get_tensors(self) -> Iterator[tuple[str, Tensor]]: def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
tensor_names_from_parts: set[str] = set() tensor_names_from_parts: set[str] = set()
if len(self.part_names) > 1: index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
index_name += ".index.json"
index_file = self.dir_model / index_name
if index_file.is_file():
self.tensor_names = set() self.tensor_names = set()
index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
index_name += ".index.json"
logger.info(f"gguf: loading model weight map from '{index_name}'") logger.info(f"gguf: loading model weight map from '{index_name}'")
with open(self.dir_model / index_name, "r", encoding="utf-8") as f: with open(index_file, "r", encoding="utf-8") as f:
index: dict[str, Any] = json.load(f) index: dict[str, Any] = json.load(f)
weight_map = index.get("weight_map") weight_map = index.get("weight_map")
if weight_map is None or not isinstance(weight_map, dict): if weight_map is None or not isinstance(weight_map, dict):
@ -142,6 +147,7 @@ class Model:
self.tensor_names.update(weight_map.keys()) self.tensor_names.update(weight_map.keys())
else: else:
self.tensor_names = tensor_names_from_parts self.tensor_names = tensor_names_from_parts
weight_map = {}
for part_name in self.part_names: for part_name in self.part_names:
logger.info(f"gguf: loading model part '{part_name}'") logger.info(f"gguf: loading model part '{part_name}'")
@ -168,9 +174,17 @@ class Model:
data = LazyTorchTensor.from_eager(data) data = LazyTorchTensor.from_eager(data)
yield name, data yield name, data
# only verify tensor name presence; it doesn't matter if they are not in the right files # verify tensor name presence and identify potentially missing files
if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
if len(extra) == 0 and len(missing_files) > 0:
raise ValueError(f"Missing or incomplete model files: {missing_files}")
else:
raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
f"Missing tensors: {missing}\n"
f"Extra tensors: {extra}")
def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
if key not in gguf.MODEL_TENSORS[self.model_arch]: if key not in gguf.MODEL_TENSORS[self.model_arch]:
@ -295,12 +309,32 @@ class Model:
gguf.MODEL_TENSOR.FFN_GATE_INP, gguf.MODEL_TENSOR.FFN_GATE_INP,
gguf.MODEL_TENSOR.POS_EMBD, gguf.MODEL_TENSOR.POS_EMBD,
gguf.MODEL_TENSOR.TOKEN_TYPES, gguf.MODEL_TENSOR.TOKEN_TYPES,
gguf.MODEL_TENSOR.SSM_CONV1D,
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
gguf.MODEL_TENSOR.TIME_MIX_W1,
gguf.MODEL_TENSOR.TIME_MIX_W2,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
) )
) )
or not name.endswith(".weight") or not new_name.endswith(".weight")
): ):
data_qtype = gguf.GGMLQuantizationType.F32 data_qtype = gguf.GGMLQuantizationType.F32
if data_qtype is False and any(
self.match_model_tensor_name(new_name, key, bid)
for key in (
gguf.MODEL_TENSOR.TOKEN_EMBD,
gguf.MODEL_TENSOR.OUTPUT,
)
):
if self.ftype in (
gguf.LlamaFileType.MOSTLY_TQ1_0,
gguf.LlamaFileType.MOSTLY_TQ2_0,
):
# TODO: use Q4_K and Q6_K
data_qtype = gguf.GGMLQuantizationType.F16
# No override (data_qtype is False), or wants to be quantized (data_qtype is True) # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
if isinstance(data_qtype, bool): if isinstance(data_qtype, bool):
if self.ftype == gguf.LlamaFileType.ALL_F32: if self.ftype == gguf.LlamaFileType.ALL_F32:
@ -311,6 +345,10 @@ class Model:
data_qtype = gguf.GGMLQuantizationType.BF16 data_qtype = gguf.GGMLQuantizationType.BF16
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
data_qtype = gguf.GGMLQuantizationType.Q8_0 data_qtype = gguf.GGMLQuantizationType.Q8_0
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
data_qtype = gguf.GGMLQuantizationType.TQ1_0
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
data_qtype = gguf.GGMLQuantizationType.TQ2_0
else: else:
raise ValueError(f"Unknown file type: {self.ftype.name}") raise ValueError(f"Unknown file type: {self.ftype.name}")
@ -590,6 +628,18 @@ class Model:
if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
# ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
res = "smollm" res = "smollm"
if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
# ref: https://huggingface.co/bigscience/bloom
res = "bloom"
if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
# ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
res = "gpt3-finnish"
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
res = "exaone"
if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
# ref: https://huggingface.co/microsoft/phi-2
res = "phi-2"
if res is None: if res is None:
logger.warning("\n") logger.warning("\n")
@ -893,7 +943,7 @@ class GPTNeoXModel(Model):
return tensors return tensors
@Model.register("BloomForCausalLM") @Model.register("BloomForCausalLM", "BloomModel")
class BloomModel(Model): class BloomModel(Model):
model_arch = gguf.MODEL_ARCH.BLOOM model_arch = gguf.MODEL_ARCH.BLOOM
@ -1448,7 +1498,7 @@ class StableLMModel(Model):
raise ValueError(f"Unprocessed norms: {norms}") raise ValueError(f"Unprocessed norms: {norms}")
@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") @Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
class LlamaModel(Model): class LlamaModel(Model):
model_arch = gguf.MODEL_ARCH.LLAMA model_arch = gguf.MODEL_ARCH.LLAMA
@ -1560,7 +1610,7 @@ class LlamaModel(Model):
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
if rope_scaling.get("rope_type", '').lower() == "llama3": if rope_scaling.get("rope_type", '').lower() == "llama3":
base = self.hparams.get("rope_theta", 10000.0) base = self.hparams.get("rope_theta", 10000.0)
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
factor = rope_scaling.get("factor", 8.0) factor = rope_scaling.get("factor", 8.0)
@ -1583,7 +1633,8 @@ class LlamaModel(Model):
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
rope_factors.append(1 / ((1 - smooth) / factor + smooth)) rope_factors.append(1 / ((1 - smooth) / factor + smooth))
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) if not self.is_lora:
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
super().prepare_tensors() super().prepare_tensors()
@ -1606,15 +1657,16 @@ class BitnetModel(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(1.0) self.gguf_writer.add_rope_scaling_factor(1.0)
def weight_quant(self, weight): def weight_quant(self, weight: Tensor) -> Tensor:
dtype = weight.dtype dtype = weight.dtype
weight = weight.float() weight = weight.float()
s = 1 / weight.abs().mean().clamp(min=1e-5) scale = weight.abs().mean().clamp(min=1e-5)
weight = (weight * s).round().clamp(-1, 1) / s iscale = 1 / scale
scale = weight.abs().max().unsqueeze(0) # TODO: multiply by the scale directly instead of inverting it twice
weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype) # (this is also unnecessarily doubly inverted upstream)
weight = torch.sign(weight).type(dtype) # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
return weight.type(dtype), scale.type(torch.float32) result = (weight * iscale).round().clamp(-1, 1) / iscale
return result.type(dtype)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
new_name = self.map_tensor_name(name) new_name = self.map_tensor_name(name)
@ -1629,11 +1681,9 @@ class BitnetModel(Model):
gguf.MODEL_TENSOR.FFN_GATE, gguf.MODEL_TENSOR.FFN_GATE,
]): ]):
# transform weight into 1/0/-1 (in fp32) # transform weight into 1/0/-1 (in fp32)
weight_torch, scale_torch = self.weight_quant(data_torch) data_torch = self.weight_quant(data_torch)
yield (new_name, weight_torch)
yield (new_name.removesuffix(".weight") + ".scale", scale_torch) yield (new_name, data_torch)
else:
yield (new_name, data_torch)
@Model.register("GrokForCausalLM") @Model.register("GrokForCausalLM")
@ -1802,6 +1852,60 @@ class MiniCPMModel(Model):
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@Model.register("MiniCPM3ForCausalLM")
class MiniCPM3Model(Model):
model_arch = gguf.MODEL_ARCH.MINICPM3
def set_gguf_parameters(self):
hparams = self.hparams
rope_dims = hparams["qk_rope_head_dim"]
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is None:
return
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
def set_vocab(self):
self._set_vocab_llama_hf()
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape)
)
@Model.register("QWenLMHeadModel") @Model.register("QWenLMHeadModel")
class QwenModel(Model): class QwenModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN model_arch = gguf.MODEL_ARCH.QWEN
@ -2130,8 +2234,9 @@ class Phi3MiniModel(Model):
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) if not self.is_lora:
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
@Model.register("PlamoForCausalLM") @Model.register("PlamoForCausalLM")
@ -2702,7 +2807,87 @@ class StarCoder2Model(Model):
model_arch = gguf.MODEL_ARCH.STARCODER2 model_arch = gguf.MODEL_ARCH.STARCODER2
@Model.register("MambaForCausalLM", "MambaLMHeadModel") @Model.register("Rwkv6ForCausalLM")
class Rwkv6Model(Model):
model_arch = gguf.MODEL_ARCH.RWKV6
def set_vocab(self):
assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
vocab_size = self.hparams.get("vocab_size", 65536)
tokens: list[bytes] = ['<s>'.encode("utf-8")]
toktypes: list[int] = [gguf.TokenType.CONTROL]
with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
lines = f.readlines()
for line in lines:
parts = line.split(' ')
assert len(parts) >= 3
token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
token = token.encode("utf-8") if isinstance(token, str) else token
assert isinstance(token, bytes)
assert len(token) == token_len
token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
tokens.append(token_text.encode("utf-8"))
toktypes.append(gguf.TokenType.NORMAL)
remainder = vocab_size - len(tokens)
assert remainder >= 0
for i in range(len(tokens), vocab_size):
tokens.append(f"[PAD{i}]".encode("utf-8"))
toktypes.append(gguf.TokenType.UNUSED)
self.gguf_writer.add_tokenizer_model("rwkv")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
head_size = self.hparams["head_size"]
hidden_size = self.hparams["hidden_size"]
layer_norm_eps = self.hparams["layer_norm_epsilon"]
rescale_every_n_layers = self.hparams["rescale_every"]
intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
time_mix_extra_dim = 64 if hidden_size == 4096 else 32
time_decay_extra_dim = 128 if hidden_size == 4096 else 64
# RWKV isn't context limited
self.gguf_writer.add_context_length(1048576)
self.gguf_writer.add_embedding_length(hidden_size)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
self.gguf_writer.add_wkv_head_size(head_size)
self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
self.gguf_writer.add_feed_forward_length(intermediate_size)
self.gguf_writer.add_file_type(self.ftype)
# required by llama.cpp, unused
self.gguf_writer.add_head_count(0)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
new_name = self.map_tensor_name(name)
if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
new_name += ".weight"
if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
data_torch = data_torch.transpose(0, 1)
if new_name.endswith("time_mix_w2.weight"):
data_torch = data_torch.permute(0, 2, 1)
rescale_every_n_layers = self.hparams["rescale_every"]
if rescale_every_n_layers > 0:
if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
yield (new_name, data_torch)
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
class MambaModel(Model): class MambaModel(Model):
model_arch = gguf.MODEL_ARCH.MAMBA model_arch = gguf.MODEL_ARCH.MAMBA
@ -2733,7 +2918,10 @@ class MambaModel(Model):
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
use_dt_b_c_norm = False
# For falconmamba we do apply RMS norm on B / DT and C layers
if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
use_dt_b_c_norm = True
# Fail early for models which don't have a block expansion factor of 2 # Fail early for models which don't have a block expansion factor of 2
assert d_inner == 2 * d_model assert d_inner == 2 * d_model
@ -2741,12 +2929,13 @@ class MambaModel(Model):
self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_ssm_conv_kernel(d_conv) self.gguf_writer.add_ssm_conv_kernel(d_conv)
self.gguf_writer.add_ssm_inner_size(d_inner) self.gguf_writer.add_ssm_inner_size(d_inner)
self.gguf_writer.add_ssm_state_size(d_state) self.gguf_writer.add_ssm_state_size(d_state)
self.gguf_writer.add_ssm_time_step_rank(dt_rank) self.gguf_writer.add_ssm_time_step_rank(dt_rank)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
_tok_embd = None _tok_embd = None
@ -2773,23 +2962,6 @@ class MambaModel(Model):
return [(new_name, data_torch)] return [(new_name, data_torch)]
def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
if bid is not None and new_name in (
self.format_tensor_name(
n, bid, ".weight" if name.endswith(".weight") else ""
)
for n in [
gguf.MODEL_TENSOR.SSM_CONV1D,
gguf.MODEL_TENSOR.SSM_X,
gguf.MODEL_TENSOR.SSM_DT,
gguf.MODEL_TENSOR.SSM_A,
gguf.MODEL_TENSOR.SSM_D,
]
):
return gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
@Model.register("CohereForCausalLM") @Model.register("CohereForCausalLM")
class CommandR2Model(Model): class CommandR2Model(Model):
@ -2837,6 +3009,66 @@ class OlmoModel(Model):
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@Model.register("OlmoeForCausalLM")
class OlmoeModel(Model):
model_arch = gguf.MODEL_ARCH.OLMOE
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_layer_norm_rms_eps(1e-5)
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
_experts: list[dict[str, Tensor]] | None = None
# Copied from: Qwen2MoeModel
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
# Copied from: Qwen2MoeModel
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("JinaBertModel", "JinaBertForMaskedLM") @Model.register("JinaBertModel", "JinaBertForMaskedLM")
class JinaBertV2Model(BertModel): class JinaBertV2Model(BertModel):
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
@ -3734,6 +3966,178 @@ class ChatGLMModel(Model):
name = name.removeprefix("transformer.") name = name.removeprefix("transformer.")
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@Model.register("NemotronForCausalLM")
class NemotronModel(Model):
model_arch = gguf.MODEL_ARCH.NEMOTRON
def set_vocab(self):
self._set_vocab_sentencepiece()
self.gguf_writer.add_pad_token_id(0)
self.gguf_writer.add_unk_token_id(1)
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
# * Partial RoPE
rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
# * RopeScaling for Nemotron
if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
else:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
# model.layers.{l}.input_layernorm.weight
# model.layers.{l}.post_attention_layernorm.weight
# model.norm.weight
if name.endswith("norm.weight"):
data_torch = data_torch + 1
return [(self.map_tensor_name(name), data_torch)]
@Model.register("ExaoneForCausalLM")
class ExaoneModel(Model):
model_arch = gguf.MODEL_ARCH.EXAONE
def set_gguf_parameters(self):
hparams = self.hparams
assert (hparams["activation_function"] == "silu")
max_position_embeddings = hparams["max_position_embeddings"]
embed_dim = hparams["hidden_size"]
num_heads = hparams["num_attention_heads"]
num_kv_heads = hparams.get("num_key_value_heads", num_heads)
layer_norm_eps = hparams["layer_norm_epsilon"]
intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
num_layers = hparams["num_layers"]
# ignore for now as EXAONE-3.0-7.8B-Instruct attention_dropout is 0.0
# attention_dropout_rate = hparams["attention_dropout"]
# ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
# embed_dropout_rate = hparams["embed_dropout"]
self.gguf_writer.add_embedding_length(embed_dim)
self.gguf_writer.add_head_count(num_heads)
self.gguf_writer.add_head_count_kv(num_kv_heads)
self.gguf_writer.add_context_length(max_position_embeddings)
self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
self.gguf_writer.add_feed_forward_length(intermediate_size)
self.gguf_writer.add_block_count(num_layers)
self.gguf_writer.add_file_type(self.ftype)
if (rope_theta := self.hparams.get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base(rope_theta)
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
if hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
def prepare_tensors(self):
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
if rope_scaling.get("rope_type", '').lower() == "llama3":
base = self.hparams.get("rope_theta", 10000.0)
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
factor = rope_scaling.get("factor", 8.0)
low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
assert low_freq_wavelen != high_freq_wavelen
rope_factors = []
for freq in freqs:
wavelen = 2 * math.pi / freq
if wavelen < high_freq_wavelen:
rope_factors.append(1)
elif wavelen > low_freq_wavelen:
rope_factors.append(factor)
else:
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
if not self.is_lora:
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
super().prepare_tensors()
@Model.register("GraniteForCausalLM")
class GraniteModel(LlamaModel):
"""Conversion for IBM's GraniteForCausalLM"""
model_arch = gguf.MODEL_ARCH.GRANITE
def set_gguf_parameters(self):
"""Granite uses standard llama parameters with the following differences:
- No head_dim support
- New multiplier params:
- attention_scale
- embedding_scale
- residual_scale
- logits_scaling
"""
if head_dim := self.hparams.pop("head_dim", None):
logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
super().set_gguf_parameters()
# NOTE: Convert _multiplier params to _scale params for naming
# consistency
if attention_scale := self.hparams.get("attention_multiplier"):
self.gguf_writer.add_attention_scale(attention_scale)
logger.info("gguf: (granite) attention_scale = %s", attention_scale)
if embedding_scale := self.hparams.get("embedding_multiplier"):
self.gguf_writer.add_embedding_scale(embedding_scale)
logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
if residual_scale := self.hparams.get("residual_multiplier"):
self.gguf_writer.add_residual_scale(residual_scale)
logger.info("gguf: (granite) residual_scale = %s", residual_scale)
if logits_scale := self.hparams.get("logits_scaling"):
self.gguf_writer.add_logit_scale(logits_scale)
logger.info("gguf: (granite) logits_scale = %s", logits_scale)
@Model.register("GraniteMoeForCausalLM")
class GraniteMoeModel(GraniteModel):
"""Conversion for IBM's GraniteMoeForCausalLM"""
model_arch = gguf.MODEL_ARCH.GRANITE_MOE
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
"""In modeling_granitemoe, the JetMoe implementation of parallel experts
is used. This essentially merges w1 and w3 into a single tensor with 2x
the hidden size that is then split during forward. To keep compatibility
with existing mixtral support, we pull them apart here.
"""
if name.endswith("block_sparse_moe.input_linear.weight"):
ffn_dim = self.hparams["intermediate_size"]
assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
return [
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
]
return super().modify_tensors(data_torch, name, bid)
###### CONVERSION LOGIC ###### ###### CONVERSION LOGIC ######
@ -3815,8 +4219,8 @@ def parse_args() -> argparse.Namespace:
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
) )
parser.add_argument( parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
) )
parser.add_argument( parser.add_argument(
"--bigendian", action="store_true", "--bigendian", action="store_true",
@ -3903,6 +4307,8 @@ def main() -> None:
"f16": gguf.LlamaFileType.MOSTLY_F16, "f16": gguf.LlamaFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16, "bf16": gguf.LlamaFileType.MOSTLY_BF16,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
"tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
"tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
"auto": gguf.LlamaFileType.GUESSED, "auto": gguf.LlamaFileType.GUESSED,
} }
View file
@ -31,6 +31,7 @@ import re
import requests import requests
import sys import sys
import json import json
import shutil
from hashlib import sha256 from hashlib import sha256
from enum import IntEnum, auto from enum import IntEnum, auto
@ -94,6 +95,10 @@ models = [
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", }, {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", }, {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", }, {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
] ]
@ -122,12 +127,27 @@ def download_model(model):
if tokt == TOKENIZER_TYPE.UGM: if tokt == TOKENIZER_TYPE.UGM:
files.append("spiece.model") files.append("spiece.model")
for file in files: if os.path.isdir(repo):
save_path = f"models/tokenizers/{name}/{file}" # If repo is a path on the file system, copy the directory
if os.path.isfile(save_path): for file in files:
logger.info(f"{name}: File {save_path} already exists - skipping") src_path = os.path.join(repo, file)
continue dst_path = f"models/tokenizers/{name}/{file}"
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) if os.path.isfile(dst_path):
logger.info(f"{name}: File {dst_path} already exists - skipping")
continue
if os.path.isfile(src_path):
shutil.copy2(src_path, dst_path)
logger.info(f"{name}: Copied {src_path} to {dst_path}")
else:
logger.warning(f"{name}: Source file {src_path} does not exist")
else:
# If repo is a URL, download the files
for file in files:
save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path):
logger.info(f"{name}: File {save_path} already exists - skipping")
continue
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
for model in models: for model in models:

View file
assert quant is not None, 'Unknown tensor type' assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant (blksize, tysize) = quant
offset += 12 offset += 12
self.dtype= dtype self.dtype= gguf.GGMLQuantizationType(dtype)
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)]) self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len]) self.name = bytes(data[offset:offset + name_len])
View file
@ -363,7 +363,13 @@ if __name__ == '__main__':
yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B))) yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
dest = super().modify_tensors(data_torch, name, bid) dest = list(super().modify_tensors(data_torch, name, bid))
# some archs may have the same tensor for lm_head and output (tie word embeddings)
# in this case, adapters targeting lm_head will fail when using llama-export-lora
# therefore, we ignore them for now
# see: https://github.com/ggerganov/llama.cpp/issues/9065
if name == "lm_head.weight" and len(dest) == 0:
raise ValueError("lm_head is present in adapter, but is ignored in base model")
for dest_name, dest_data in dest: for dest_name, dest_data in dest:
assert isinstance(dest_data, LoraTorchTensor) assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B() lora_a, lora_b = dest_data.get_lora_A_B()
@ -386,6 +392,7 @@ if __name__ == '__main__':
dry_run=args.dry_run, dry_run=args.dry_run,
dir_lora_model=dir_lora, dir_lora_model=dir_lora,
lora_alpha=alpha, lora_alpha=alpha,
is_lora=True,
) )
logger.info("Exporting model...") logger.info("Exporting model...")
docs/backend/CANN.md Normal file
View file
@ -0,0 +1,259 @@
# llama.cpp for CANN
- [Background](#background)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)
- [Model Supports](#model-supports)
- [DataType Supports](#datatype-supports)
- [Docker](#docker)
- [Linux](#linux)
- [TODO](#todo)
## Background
**Ascend NPU** is a range of AI processors built around a Neural Processing Unit. These processors efficiently handle matrix-matrix multiplication, dot products and scalar operations.
**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
**Llama.cpp + CANN**
The llama.cpp CANN backend is designed to support Ascend NPUs. It uses the AscendC and ACLNN capabilities that are integrated into the CANN Toolkit and kernels to drive the Ascend NPU directly.
## News
- 2024.8
- Support `Q4_0` and `Q8_0` data type for Ascend NPU.
- 2024.7
- Create CANN backend for Ascend NPU.
## OS
| OS | Status | Verified |
|:-------:|:-------:|:----------------------------------------------:|
| Linux | Support | Ubuntu 22.04, OpenEuler22.03 |
## Hardware
### Ascend NPU
**Verified devices**
| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
*Notes:*
- If you have trouble with your Ascend NPU device, please create an issue with the **[CANN]** prefix/tag.
- If you run successfully with your Ascend NPU device, please help update the table above.
## Model Supports
| Model Name | FP16 | Q8_0 | Q4_0 |
|:----------------------------|:-----:|:----:|:----:|
| AquilaChat2-7B | √ | √ | √ |
| Baichuan-7b | √ | √ | √ |
| Baichuan2-7B-Chat | √ | √ | √ |
| bitnet_b1_58-large | √ | √ | √ |
| bloom-560m | √ | x | √ |
| bloomz-alpaca-560m | √ | x | √ |
| c4ai-command-r-35B-v01 | x | x | x |
| chatglm3-6B | x | x | x |
| chinese-alpaca-2-1.3b | √ | √ | √ |
| CodeShell-7B | √ | √ | √ |
| deepseek-ai_deepseek-coder-1.3B-base | x | x | x |
| deepseek-ai_DeepSeek-V2-Lite | x | x | x |
| deepseek-coder-6.7B-instruct | x | x | x |
| DeepSeek-V2-Lite-64x1.5B | x | x | x |
| falcon-7b-instruct | √ | √ | √ |
| flan-t5-large | √ | √ | √ |
| gemma-2-9b-it | √ | √ | √ |
| glm-4-9B | x | x | x |
| gpt2 | √ | √ | √ |
| Gpt2-163M | √ | √ | √ |
| granite-3B-code-instruct | √ | √ | √ |
| GritLM-7B | √ | √ | √ |
| internlm2_5-7b-chat | √ | √ | √ |
| koala-7B-HF | √ | √ | √ |
| Llama-2-7b-chat-hf | √ | √ | √ |
| Llama-3-Smaug-8B | √ | √ | √ |
| Llama2-Chinese-7b-Chat | √ | √ | √ |
| Llama3-8B | √ | √ | √ |
| Llama3-8b-chinese | √ | √ | √ |
| mamba-130m-hf | √ | √ | √ |
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
| mpt-7B | √ | √ | √ |
| OLMo-1B-hf | √ | √ | √ |
| OpenELM-3B-Instruct | √ | √ | √ |
| Orion-14b-base | √ | √ | √ |
| phi1 | x | x | x |
| phi2 | x | x | x |
| Phi-3-mini-4k-instruct | √ | √ | √ |
| plamo-13b | √ | √ | √ |
| pythia-70M | x | x | x |
| Qwen-7B | √ | √ | √ |
| Qwen2-1.5B-Instruct | √ | x | √ |
| Refact-1_6B-fim | √ | √ | √ |
| SmolLM-135M | √ | √ | √ |
| stablelm-zephyr | x | x | x |
| stablelm-2-zephyr-1_6b | x | x | x |
| starcoderbase-1b | √ | √ | √ |
| starcoder2-3b | √ | √ | √ |
| vigogne-7b-chat | √ | √ | √ |
| xverse-7b-chat | √ | √ | √ |
| Yi-6b-Chat | √ | √ | √ |
## DataType Supports
| DataType | Status |
|:----------------------:|:-------:|
| FP16 | Support |
| Q8_0 | Support |
| Q4_0 | Support |
## Docker
### Build Images
You can build an image with llama.cpp in one command.
```sh
docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile .
```
### Run container
```sh
# Find all cards.
npu-smi info
# Select the cards that you want to use; make sure they are not in use by someone else.
# The following uses the card of device 0.
docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
```
*Notes:*
- You may need to install Ascend Driver and firmware on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
## Linux
### I. Setup Environment
1. **Install Ascend Driver and firmware**
```sh
# create driver running user.
sudo groupadd -g HwHiAiUser
sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
sudo usermod -aG HwHiAiUser $USER
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
# and install driver.
sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all
```
Once installed, run `npu-smi info` to check whether the driver has been installed successfully.
```sh
+-------------------------------------------------------------------------------------------+
| npu-smi 24.1.rc2 Version: 24.1.rc2 |
+----------------------+---------------+----------------------------------------------------+
| NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)|
| Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) |
+======================+===============+====================================================+
| 2 xxx | OK | 64.4 51 15 / 15 |
| 0 | 0000:01:00.0 | 0 1873 / 15077 0 / 32768 |
+======================+===============+====================================================+
| 5 xxx | OK | 64.0 52 15 / 15 |
| 0 | 0000:81:00.0 | 0 1874 / 15077 0 / 32768 |
+======================+===============+====================================================+
| No running processes found in NPU 2 |
+======================+===============+====================================================+
| No running processes found in NPU 5 |
+======================+===============+====================================================+
```
2. **Install Ascend Firmware**
```sh
# Download the firmware that matches your system from https://www.hiascend.com/hardware/firmware-drivers/community
# and install it.
sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
```
If the following message appears, the firmware was installed successfully.
```sh
Firmware package installed successfully!
```
3. **Install CANN toolkit and kernels**
CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page.
Please download the version that matches your system. The minimum required version is 8.0.RC2.alpha002; the install commands are shown below.
```sh
pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install
sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install
```
Set Ascend Variables:
```sh
echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
source ~/.bashrc
```
Upon a successful installation, CANN is enabled for the available Ascend devices.
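A minimal sanity check, assuming the default `~/Ascend` install prefix used above, is to source the environment script and confirm the NPUs are still visible:
```sh
# Load the CANN environment and list the NPUs (assumes the default install prefix)
source ~/Ascend/ascend-toolkit/set_env.sh
npu-smi info
```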
### II. Build llama.cpp
```sh
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
cmake --build build --config release
```
### III. Run the inference
1. **Retrieve and prepare model**
You can refer to the general [*Prepare and Quantize*](../../README.md#prepare-and-quantize) guide for model preparation.
**Notes**:
- CANN backend only supports FP16/Q4_0/Q8_0 models currently.
2. **Launch inference**
There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically choose the devices with the same backend.
| Device selection | Parameter |
|:----------------:|:--------------------------------------:|
| Single device | --split-mode none --main-gpu DEVICE_ID |
| Multiple devices | --split-mode layer (default) |
Examples:
- Use device 0:
```sh
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
- Use multiple devices:
```sh
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
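For a quick throughput measurement on the NPU, the stock `llama-bench` tool can also be used; this is only a sketch (the tool is not covered by this guide and the model path is a placeholder):
```sh
# Benchmark prompt processing and generation speed with 32 layers offloaded
./build/bin/llama-bench -m path_to_model -ngl 32
```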
### **GitHub contribution**:
Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay.
## TODO
- Support more models and data types.

View file

@ -20,7 +20,7 @@
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include: **oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers. - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*. - **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs. - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets. - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
@ -28,10 +28,6 @@
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*). The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
## Recommended Release ## Recommended Release
The SYCL backend would be broken by some PRs due to no online CI. The SYCL backend would be broken by some PRs due to no online CI.
@ -45,6 +41,10 @@ The following release is verified with good quality:
## News ## News
- 2024.8
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
- 2024.5 - 2024.5
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770. - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
- Arch Linux is verified successfully. - Arch Linux is verified successfully.
@ -80,7 +80,14 @@ The following release is verified with good quality:
### Intel GPU ### Intel GPU
**Verified devices** SYCL backend supports Intel GPU Family:
- Intel Data Center Max Series
- Intel Flex Series, Arc Series
- Intel Built-in Arc GPU
- Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)).
#### Verified devices
| Intel GPU | Status | Verified Model | | Intel GPU | Status | Verified Model |
|-------------------------------|---------|---------------------------------------| |-------------------------------|---------|---------------------------------------|
@ -88,7 +95,7 @@ The following release is verified with good quality:
| Intel Data Center Flex Series | Support | Flex 170 | | Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc Series | Support | Arc 770, 730M, Arc A750 | | Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake | | Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 | | Intel iGPU | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 |
*Notes:* *Notes:*
@ -189,7 +196,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li
Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable. Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs. Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.
- **Adding support to Nvidia GPUs** - **Adding support to Nvidia GPUs**
@ -237,12 +244,17 @@ Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA devic
### II. Build llama.cpp ### II. Build llama.cpp
#### Intel GPU #### Intel GPU
```
./examples/sycl/build.sh
```
or
```sh ```sh
# Export relevant ENV variables # Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh source /opt/intel/oneapi/setvars.sh
# Build LLAMA with MKL BLAS acceleration for intel GPU
# Option 1: Use FP32 (recommended for better performance in most cases) # Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
@ -276,23 +288,26 @@ cmake --build build --config Release -j -v
### III. Run the inference ### III. Run the inference
1. Retrieve and prepare model #### Retrieve and prepare model
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
2. Enable oneAPI running environment ##### Check device
1. Enable oneAPI running environment
```sh ```sh
source /opt/intel/oneapi/setvars.sh source /opt/intel/oneapi/setvars.sh
``` ```
3. List devices information 2. List devices information
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
```sh ```sh
./build/bin/llama-ls-sycl-device ./build/bin/llama-ls-sycl-device
``` ```
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
``` ```
found 2 SYCL devices: found 2 SYCL devices:
@ -304,12 +319,37 @@ found 2 SYCL devices:
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
``` ```
#### Choose level-zero devices
4. Launch inference |Chosen Device ID|Setting|
|-|-|
|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action|
|1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
#### Execute
Choose one of the following methods to run.
1. Script
- Use device 0:
```sh
./examples/sycl/run-llama2.sh 0
```
- Use multiple devices:
```sh
./examples/sycl/run-llama2.sh
```
2. Command line
Launch inference
There are two device selection modes: There are two device selection modes:
- Single device: Use one device target specified by the user. - Single device: Use one device assigned by user. Default device id is 0.
- Multiple devices: Automatically choose the devices with the same backend. - Multiple devices: Automatically choose the devices with the same backend.
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR. In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
@ -326,11 +366,6 @@ Examples:
```sh ```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
``` ```
or run by script:
```sh
./examples/sycl/run_llama2.sh 0
```
- Use multiple devices: - Use multiple devices:
@ -338,12 +373,6 @@ or run by script:
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
``` ```
Otherwise, you can run the script:
```sh
./examples/sycl/run_llama2.sh
```
*Notes:* *Notes:*
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow: - Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
@ -390,7 +419,7 @@ c. Verify installation
In the oneAPI command line, run the following to print the available SYCL devices: In the oneAPI command line, run the following to print the available SYCL devices:
``` ```
sycl-ls sycl-ls.exe
``` ```
There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device: There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device:
@ -411,6 +440,18 @@ b. The new Visual Studio will install Ninja as default. (If not, please install
### II. Build llama.cpp ### II. Build llama.cpp
You could download the release package for Windows directly, which includes the binary files and the required oneAPI DLL files.
Choose one of the following methods to build from source code.
1. Script
```sh
.\examples\sycl\win-build-sycl.bat
```
2. CMake
On the oneAPI command line window, step into the llama.cpp main directory and run the following: On the oneAPI command line window, step into the llama.cpp main directory and run the following:
``` ```
@ -425,12 +466,8 @@ cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPI
cmake --build build --config Release -j cmake --build build --config Release -j
``` ```
Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
```sh
.\examples\sycl\win-build-sycl.bat
```
Or, use CMake presets to build: Or, use CMake presets to build:
```sh ```sh
cmake --preset x64-windows-sycl-release cmake --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-cli cmake --build build-x64-windows-sycl-release -j --target llama-cli
@ -442,7 +479,9 @@ cmake --preset x64-windows-sycl-debug
cmake --build build-x64-windows-sycl-debug -j --target llama-cli cmake --build build-x64-windows-sycl-debug -j --target llama-cli
``` ```
Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. 3. Visual Studio
You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
*Notes:* *Notes:*
@ -450,23 +489,25 @@ Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choos
### III. Run the inference ### III. Run the inference
1. Retrieve and prepare model #### Retrieve and prepare model
You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
2. Enable oneAPI running environment ##### Check device
1. Enable oneAPI running environment
On the oneAPI command line window, run the following and step into the llama.cpp directory: On the oneAPI command line window, run the following and step into the llama.cpp directory:
``` ```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
``` ```
3. List devices information 2. List devices information
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
``` ```
build\bin\ls-sycl-device.exe build\bin\llama-ls-sycl-device.exe
``` ```
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
@ -479,9 +520,27 @@ found 2 SYCL devices:
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
``` ```
#### Choose level-zero devices
|Chosen Device ID|Setting|
|-|-|
|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action|
|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
4. Launch inference #### Execute
Choose one of the following methods to run.
1. Script
```
examples\sycl\win-run-llama2.bat
```
2. Command line
Launch inference
There are two device selection modes: There are two device selection modes:
@ -508,11 +567,7 @@ build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website ca
``` ```
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
``` ```
Otherwise, run the following wrapper script:
```
.\examples\sycl\win-run-llama2.bat
```
Note: Note:
@ -526,17 +581,18 @@ Or
use 1 SYCL GPUs: [0] with Max compute units:512 use 1 SYCL GPUs: [0] with Max compute units:512
``` ```
## Environment Variable ## Environment Variable
#### Build #### Build
| Name | Value | Function | | Name | Value | Function |
|--------------------|-----------------------------------|---------------------------------------------| |--------------------|-----------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. | | GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better performance than FP16 on quantized model|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. | | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
#### Runtime #### Runtime
@ -572,9 +628,26 @@ use 1 SYCL GPUs: [0] with Max compute units:512
``` ```
Otherwise, please double-check the GPU driver installation steps. Otherwise, please double-check the GPU driver installation steps.
- Can I report an Ollama issue on Intel GPU to the llama.cpp SYCL backend?
No. We can't support Ollama issues directly, because we aren't familiar with Ollama.
Please reproduce the issue on llama.cpp and report a similar issue to llama.cpp; we will support it there.
The same applies to other projects that include the llama.cpp SYCL backend.
- Hitting the error `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
The device memory is not enough.
|Reason|Solution|
|-|-|
|The default context is too big. It leads to higher memory usage.|Set `-c 8192` or a smaller value.|
|The model is big and requires more memory than the device has.|Choose a smaller quantized model, e.g. Q5 -> Q4;<br>use more than one device to load the model.|
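For example, a reduced context size can be requested explicitly on the command line (a sketch reusing the model and flags from the examples above):
```sh
# Limit the context to 8192 tokens to lower device memory usage
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -c 8192 -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```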
### **GitHub contribution**: ### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay. Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
## TODO ## TODO
- Support row layer split for multiple card runs. - NA

View file

@ -352,6 +352,37 @@ cmake --build build --config Release
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
``` ```
### CANN
This provides NPU acceleration using the AI cores of your Ascend NPU. [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical set of APIs that helps you quickly build AI applications and services on Ascend NPUs.
For more information about Ascend NPUs, visit the [Ascend Community](https://www.hiascend.com/en/).
Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
Go to `llama.cpp` directory and build using CMake.
```bash
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
cmake --build build --config release
```
You can test with:
`./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
If the following info is printed on screen, you are running `llama.cpp` with the CANN backend:
```bash
llm_load_tensors: CANN buffer size = 13313.00 MiB
llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
```
For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
### Android ### Android
To read documentation for how to build on Android, [click here](./android.md) To read documentation for how to build on Android, [click here](./android.md)
### Arm CPU optimized mulmat kernels
Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
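A minimal sketch combining these options, assuming an existing FP16 GGUF model (file names are placeholders):
```sh
# Build with Arm-optimized compiler flags; disable llamafile so Q4_0_4_4 is usable
cmake -B build -DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve -DGGML_LLAMAFILE=OFF
cmake --build build --config Release
# Requantize into one of the Arm-optimized formats
./build/bin/llama-quantize models/model-f16.gguf models/model-q4_0_4_8.gguf Q4_0_4_8
```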

View file

@ -20,7 +20,7 @@ Additionally, there the following images, similar to the above:
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now). The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
## Usage ## Usage
@ -66,8 +66,8 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment
The defaults are: The defaults are:
- `CUDA_VERSION` set to `11.7.1` - `CUDA_VERSION` set to `12.6.0`
- `CUDA_DOCKER_ARCH` set to `all` - `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures
The resulting images, are essentially the same as the non-CUDA images: The resulting images, are essentially the same as the non-CUDA images:

View file

@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
#endif #endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) { static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) { if (plan.work_size > 0) {
buf.resize(plan.work_size); buf.resize(plan.work_size);

View file

@ -49,3 +49,12 @@ There are 2 modes of operation:
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 | | 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 | | 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 | | 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
### JSONL output
Pass `--output-format jsonl` to output JSONL instead of Markdown, á la
```json lines
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
```
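Since each record is a single JSON object per line, the output can be piped into standard tooling. A sketch assuming `jq` is installed and reusing the binary name and flags from the usage example above:
```sh
./build/bin/llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256 -ntg 128 -npl 1,2 --output-format jsonl \
  | jq -r '[.pp, .tg, .pl, .speed] | @tsv'
```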

View file

@ -1,49 +1,28 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <algorithm> #include <algorithm>
#include <cmath>
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <vector> #include <vector>
// mutates the input string static void print_usage(int, char ** argv) {
static std::vector<int> parse_list(char * p) { LOG("\nexample usage:\n");
std::vector<int> ret; LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG("\n");
char * q = p;
while (*p) {
if (*p == ',') {
*p = '\0';
ret.push_back(std::atoi(q));
q = p + 1;
}
++p;
}
ret.push_back(std::atoi(q));
return ret;
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG_TEE("\n");
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
print_usage(argc, argv, params);
return 1; return 1;
} }
gpt_init();
int is_pp_shared = params.is_pp_shared; int is_pp_shared = params.is_pp_shared;
std::vector<int> n_pp = params.n_pp; std::vector<int> n_pp = params.n_pp;
@ -100,7 +79,7 @@ int main(int argc, char ** argv) {
const int ret = llama_decode(ctx, batch_view); const int ret = llama_decode(ctx, batch_view);
if (ret != 0) { if (ret != 0) {
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false; return false;
} }
@ -117,17 +96,18 @@ int main(int argc, char ** argv) {
} }
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
} }
LOG_TEE("\n"); if (!params.batched_bench_output_jsonl) {
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); LOG("\n");
LOG_TEE("\n"); LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
}
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) { for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) { for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@ -156,7 +136,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
@ -178,7 +158,7 @@ int main(int argc, char ** argv) {
} }
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
} }
@ -195,12 +175,22 @@ int main(int argc, char ** argv) {
const float speed_tg = pl*tg / t_tg; const float speed_tg = pl*tg / t_tg;
const float speed = n_kv / t; const float speed = n_kv / t;
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); if(params.batched_bench_output_jsonl) {
LOG(
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
"\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
);
} else {
LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
}
} }
} }
} }
llama_print_timings(ctx); LOG("\n");
llama_perf_context_print(ctx);
llama_batch_free(batch); llama_batch_free(batch);
@ -209,7 +199,7 @@ int main(int argc, char ** argv) {
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n"); LOG("\n\n");
return 0; return 0;
} }

View file

@ -27,7 +27,6 @@ guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), mo
print("Failed to load model") print("Failed to load model")
exit(1) exit(1)
} }
defer { defer {
llama_free_model(model) llama_free_model(model)
} }
@ -37,7 +36,6 @@ var tokens = tokenize(text: prompt, add_bos: true)
let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel) let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
var context_params = llama_context_default_params() var context_params = llama_context_default_params()
context_params.seed = 1234
context_params.n_ctx = n_kv_req context_params.n_ctx = n_kv_req
context_params.n_batch = UInt32(max(n_len, n_parallel)) context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8 context_params.n_threads = 8
@ -48,11 +46,26 @@ guard context != nil else {
print("Failed to initialize context") print("Failed to initialize context")
exit(1) exit(1)
} }
defer { defer {
llama_free(context) llama_free(context)
} }
var sparams = llama_sampler_chain_default_params()
let smpl = llama_sampler_chain_init(sparams)
guard smpl != nil else {
print("Failed to initialize sampling")
exit(1)
}
defer {
llama_sampler_free(smpl)
}
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4));
llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234));
let n_ctx = llama_n_ctx(context) let n_ctx = llama_n_ctx(context)
print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n") print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
@ -125,32 +138,7 @@ while n_cur <= n_len {
continue continue
} }
var n_vocab = llama_n_vocab(model) let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
var logits = llama_get_logits_ith(context, i_batch[i])
var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
for token_id in 0 ..< n_vocab {
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
}
var candidates_p: llama_token_data_array = .init(
data: &candidates,
size: candidates.count,
sorted: false
)
let top_k: Int32 = 40
let top_p: Float = 0.9
let temp: Float = 0.4
llama_sample_top_k(context, &candidates_p, top_k, 1)
llama_sample_top_p(context, &candidates_p, top_p, 1)
llama_sample_temp(context, &candidates_p, temp)
let new_token_id = llama_sample_token(context, &candidates_p)
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished // is it an end of stream? -> mark the stream as finished
if llama_token_is_eog(model, new_token_id) || n_cur == n_len { if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
@ -210,9 +198,10 @@ if n_parallel > 1 {
let t_main_end = ggml_time_us() let t_main_end = ggml_time_us()
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n") print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
llama_print_timings(context) llama_perf_sampler_print(smpl)
llama_perf_context_print(context)
private func tokenize(text: String, add_bos: Bool) -> [llama_token] { private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count let utf8Count = text.utf8.count

View file

@ -1,18 +1,17 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <algorithm> #include <algorithm>
#include <cmath>
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <vector> #include <vector>
static void print_usage(int argc, char ** argv, const gpt_params & params) { static void print_usage(int, char ** argv) {
gpt_params_print_usage(argc, argv, params); LOG("\nexample usage:\n");
LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
LOG_TEE("\nexample usage:\n"); LOG("\n");
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
LOG_TEE("\n");
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
@ -21,11 +20,11 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is"; params.prompt = "Hello my name is";
params.n_predict = 32; params.n_predict = 32;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
print_usage(argc, argv, params);
return 1; return 1;
} }
gpt_init();
// number of parallel batches // number of parallel batches
int n_parallel = params.n_parallel; int n_parallel = params.n_parallel;
@ -45,7 +44,7 @@ int main(int argc, char ** argv) {
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__); LOG_ERR("%s: error: unable to load model\n" , __func__);
return 1; return 1;
} }
@ -65,32 +64,39 @@ int main(int argc, char ** argv) {
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_new_context_with_model(model, ctx_params);
auto sparams = llama_sampler_chain_default_params();
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
if (ctx == NULL) { if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
return 1; return 1;
} }
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens // make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) { if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
return 1; return 1;
} }
// print the prompt token-by-token // print the prompt token-by-token
fprintf(stderr, "\n"); LOG("\n");
for (auto id : tokens_list) { for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); LOG("%s", llama_token_to_piece(ctx, id).c_str());
} }
fflush(stderr);
// create a llama_batch // create a llama_batch
// we use this object to submit token data for decoding // we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel); llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
@ -108,7 +114,7 @@ int main(int argc, char ** argv) {
if (llama_model_has_encoder(model)) { if (llama_model_has_encoder(model)) {
if (llama_encode(ctx, batch)) { if (llama_encode(ctx, batch)) {
LOG_TEE("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return 1; return 1;
} }
@ -125,7 +131,7 @@ int main(int argc, char ** argv) {
batch.logits[batch.n_tokens - 1] = true; batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx, batch) != 0) { if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
@ -136,7 +142,7 @@ int main(int argc, char ** argv) {
//} //}
if (n_parallel > 1) { if (n_parallel > 1) {
LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
} }
// main loop // main loop
@ -164,36 +170,14 @@ int main(int argc, char ** argv) {
continue; continue;
} }
auto n_vocab = llama_n_vocab(model); const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
const int top_k = 40;
const float top_p = 0.9f;
const float temp = 0.4f;
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_temp (ctx, &candidates_p, temp);
const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of generation? -> mark the stream as finished // is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1; i_batch[i] = -1;
LOG_TEE("\n"); LOG("\n");
if (n_parallel > 1) { if (n_parallel > 1) {
LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
} }
continue; continue;
@ -201,8 +185,7 @@ int main(int argc, char ** argv) {
// if there is only one stream, we print immediately to stdout // if there is only one stream, we print immediately to stdout
if (n_parallel == 1) { if (n_parallel == 1) {
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);
} }
streams[i] += llama_token_to_piece(ctx, new_token_id); streams[i] += llama_token_to_piece(ctx, new_token_id);
@ -224,32 +207,33 @@ int main(int argc, char ** argv) {
// evaluate the current batch with the transformer model // evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) { if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1; return 1;
} }
} }
LOG_TEE("\n");
if (n_parallel > 1) { if (n_parallel > 1) {
LOG_TEE("\n"); LOG("\n");
for (int32_t i = 0; i < n_parallel; ++i) { for (int32_t i = 0; i < n_parallel; ++i) {
LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
} }
} }
const auto t_main_end = ggml_time_us(); const auto t_main_end = ggml_time_us();
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
llama_print_timings(ctx); LOG("\n");
llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
llama_batch_free(batch); llama_batch_free(batch);
llama_sampler_free(smpl);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);

View file

@ -21,7 +21,7 @@
#endif #endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) { static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) { if (plan.work_size > 0) {
buf.resize(plan.work_size); buf.resize(plan.work_size);
@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
struct benchmark_params_struct { struct benchmark_params_struct {
int32_t n_threads = 1; int n_threads = 1;
int32_t n_iterations = 10; int32_t n_iterations = 10;
}; };
@ -183,7 +183,7 @@ int main(int argc, char ** argv) {
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
TENSOR_DUMP(gf->nodes[0]); TENSOR_DUMP(ggml_graph_node(gf, 0));
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
@ -224,7 +224,7 @@ int main(int argc, char ** argv) {
// Let's use the F32 result from above as a reference for the quantized multiplication // Let's use the F32 result from above as a reference for the quantized multiplication
float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]); float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
printf("=====================================================================================\n"); printf("=====================================================================================\n");
@ -252,7 +252,7 @@ int main(int argc, char ** argv) {
// Check that the matrix multiplication result is in the right ballpark // Check that the matrix multiplication result is in the right ballpark
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]); float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference); float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6 float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6

View file

@ -9,6 +9,7 @@
#include <climits> #include <climits>
#include <cstring> #include <cstring>
#include <cstdarg> #include <cstdarg>
#include <cinttypes>
#include <ctime> #include <ctime>
#include <random> #include <random>
#include <stdexcept> #include <stdexcept>
@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads; const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
try { try {
w->token_embedding_table.resize(p->vocab_size * p->dim); w->token_embedding_table.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->rms_att_weight.resize(p->n_layers * p->dim); w->rms_att_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_ffn_weight.resize(p->n_layers * p->dim); w->rms_ffn_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->wq.resize(p->n_layers * p->dim * p->dim); w->wq.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries); w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries); w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wo.resize(p->n_layers * p->dim * p->dim); w->wo.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->w1.resize(p->n_layers * p->hidden_dim * p->dim); w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w2.resize(p->n_layers * p->hidden_dim * p->dim); w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w3.resize(p->n_layers * p->hidden_dim * p->dim); w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->rms_final_weight.resize(p->dim); w->rms_final_weight.resize(p->dim);
LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) { if (shared_weights) {
w->wcls = {}; w->wcls = {};
} else { } else {
w->wcls.resize(p->vocab_size * p->dim); w->wcls.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
} }
} }
catch (std::length_error &) { catch (std::length_error &) {
@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
fseek(f, 0, SEEK_END); fseek(f, 0, SEEK_END);
auto end = ftell(f); auto end = ftell(f);
if (curr != end) { if (curr != end) {
LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
return 1; return 1;
} }
@ -181,20 +182,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
} }
static void print_sample_weights(TransformerWeights *w){ static void print_sample_weights(TransformerWeights *w){
LOG("----- Quick print of first of the weight vales of all the variables\n"); LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
LOG("%f\n", w->token_embedding_table[0]); LOG_INF("%f\n", w->token_embedding_table[0]);
LOG("%f\n", w->rms_att_weight[0]); LOG_INF("%f\n", w->rms_att_weight[0]);
LOG("%f\n", w->rms_ffn_weight[0]); LOG_INF("%f\n", w->rms_ffn_weight[0]);
LOG("%f\n", w->wq[0]); LOG_INF("%f\n", w->wq[0]);
LOG("%f\n", w->wk[0]); LOG_INF("%f\n", w->wk[0]);
LOG("%f\n", w->wv[0]); LOG_INF("%f\n", w->wv[0]);
LOG("%f\n", w->wo[0]); LOG_INF("%f\n", w->wo[0]);
LOG("%f\n", w->w1[0]); LOG_INF("%f\n", w->w1[0]);
LOG("%f\n", w->w2[0]); LOG_INF("%f\n", w->w2[0]);
LOG("%f\n", w->w3[0]); LOG_INF("%f\n", w->w3[0]);
LOG("%f\n", w->rms_att_weight[0]); LOG_INF("%f\n", w->rms_att_weight[0]);
if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]); if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -318,20 +319,20 @@ struct train_params {
}; };
static void print_params(struct my_llama_hparams * params) { static void print_params(struct my_llama_hparams * params) {
LOG("%s: n_vocab: %u\n", __func__, params->n_vocab); LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab);
LOG("%s: n_ctx: %u\n", __func__, params->n_ctx); LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx);
LOG("%s: n_embd: %u\n", __func__, params->n_embd); LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd);
LOG("%s: n_mult: %u\n", __func__, params->n_mult); LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult);
LOG("%s: n_head: %u\n", __func__, params->n_head); LOG_INF("%s: n_head: %u\n", __func__, params->n_head);
LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv); LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
LOG("%s: n_ff: %u\n", __func__, params->n_ff); LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff);
LOG("%s: n_layer: %u\n", __func__, params->n_layer); LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer);
LOG("%s: n_rot: %u\n", __func__, params->n_rot); LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot);
} }
static void print_tensor_info(const struct ggml_context * ctx) { static void print_tensor_info(const struct ggml_context * ctx) {
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
LOG("%s: Allocating ", __func__); LOG_INF("%s: Allocating ", __func__);
int64_t total = 1; int64_t total = 1;
int i = 0; int i = 0;
for (; i < ggml_n_dims(t); ++i) { for (; i < ggml_n_dims(t); ++i) {
@ -526,7 +527,7 @@ static std::string llama_escape_whitespaces(const std::string & text) {
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) { static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
if (is_ggml_file(filename)) { if (is_ggml_file(filename)) {
LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename); LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = { struct gguf_init_params params = {
@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
gguf_free(ctx); gguf_free(ctx);
} else { } else {
// assume llama2.c vocabulary // assume llama2.c vocabulary
LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
llama_file file(filename, "rb"); llama_file file(filename, "rb");
if (!file.fp) { if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename); die_fmt("%s: %s", strerror(errno), filename);
@ -871,23 +872,25 @@ static std::string basename(const std::string &path) {
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_init();
struct train_params params = get_default_train_params(); struct train_params params = get_default_train_params();
if (!params_parse(argc, argv, &params)) { if (!params_parse(argc, argv, &params)) {
return 1; return 1;
} }
log_set_target(stdout);
Config config; Config config;
TransformerWeights weights = {}; TransformerWeights weights = {};
{ {
LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
FILE * file = fopen(params.fn_llama2c_model, "rb"); FILE * file = fopen(params.fn_llama2c_model, "rb");
if (!file) { if (!file) {
LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
return 1; return 1;
} }
// read in the config header // read in the config header
if (fread(&config, sizeof(Config), 1, file) != 1) { if (fread(&config, sizeof(Config), 1, file) != 1) {
LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
return 1; return 1;
} }
auto shared_weights = config.vocab_size > 0; auto shared_weights = config.vocab_size > 0;
@ -896,7 +899,7 @@ int main(int argc, char ** argv) {
// read in the Transformer weights // read in the Transformer weights
alloc_weights(&weights, &config, shared_weights); alloc_weights(&weights, &config, shared_weights);
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
return 1; return 1;
} }
fclose(file); fclose(file);
@ -929,7 +932,7 @@ int main(int argc, char ** argv) {
model.name = basename(params.fn_llama2c_model); model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx); ggml_free(model.ctx);
return 0; return 0;


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "ggml.h" #include "ggml.h"
@ -12,14 +13,15 @@
#include "ggml-metal.h" #include "ggml-metal.h"
#endif #endif
#include <algorithm>
#include <climits>
#include <cstdio> #include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <climits>
////////////////////////////////////////////////// //////////////////////////////////////////////////
@ -35,9 +37,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
return ret; return ret;
} }
static void print_usage(int argc, char ** argv, const gpt_params & params) { static void print_usage(int, char ** argv) {
gpt_params_print_usage(argc, argv, params);
printf("\nexample usage:\n"); printf("\nexample usage:\n");
printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]); printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]); printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
@ -271,7 +271,7 @@ struct tokenized_prompt {
size_t max_seq_len; size_t max_seq_len;
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true); tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true); tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@ -390,8 +390,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
print_usage(argc, argv, params);
return 1; return 1;
} }
@ -486,8 +485,8 @@ int main(int argc, char ** argv) {
if (use_pca) { if (use_pca) {
// run PCA // run PCA
PCA::pca_params pca_params; PCA::pca_params pca_params;
pca_params.n_threads = params.n_threads; pca_params.n_threads = params.cpuparams.n_threads;
pca_params.n_batch = params.n_pca_batch; pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations; pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
} else { } else {


@ -12,12 +12,9 @@
#include <cstdio> #include <cstdio>
#include <ctime> #include <ctime>
#include <random>
#include <string> #include <string>
#include <tuple>
#include <vector> #include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#define DEBUG_POS 5 #define DEBUG_POS 5
@ -229,8 +226,8 @@ static ggml_status compute_piter(
result.eigenvectors.resize(params.n_batch); result.eigenvectors.resize(params.n_batch);
result.distances.resize(params.n_batch); result.distances.resize(params.n_batch);
// get output nodes // get output nodes
for (int i = 0; i < gf->n_nodes; ++i) { for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
auto node = gf->nodes[i]; auto node = ggml_graph_node(gf, i);
int iter = -1; int iter = -1;
// find b_tensor (without copying data from device) // find b_tensor (without copying data from device)
if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {


@ -1,4 +1,6 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <ctime> #include <ctime>
@ -38,16 +40,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
// run model // run model
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) { if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
// encoder-only model // encoder-only model
if (llama_encode(ctx, batch) < 0) { if (llama_encode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to encode\n", __func__); LOG_ERR("%s : failed to encode\n", __func__);
} }
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) { } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
// decoder-only model // decoder-only model
if (llama_decode(ctx, batch) < 0) { if (llama_decode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to decode\n", __func__); LOG_ERR("%s : failed to decode\n", __func__);
} }
} }
@ -79,25 +81,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
gpt_init();
params.embedding = true; params.embedding = true;
// For non-causal models, batch size must be equal to ubatch size // For non-causal models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch; params.n_ubatch = params.n_batch;
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@ -107,7 +100,7 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__); LOG_ERR("%s: unable to load model\n", __func__);
return 1; return 1;
} }
@ -117,19 +110,19 @@ int main(int argc, char ** argv) {
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) { if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__); LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
return 1; return 1;
} }
if (n_ctx > n_ctx_train) { if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx); __func__, n_ctx_train, n_ctx);
} }
// print system information // print system information
{ {
fprintf(stderr, "\n"); LOG_INF("\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
} }
// split the prompt into lines // split the prompt into lines
@ -144,7 +137,7 @@ int main(int argc, char ** argv) {
for (const auto & prompt : prompts) { for (const auto & prompt : prompts) {
auto inp = ::llama_tokenize(ctx, prompt, true, false); auto inp = ::llama_tokenize(ctx, prompt, true, false);
if (inp.size() > n_batch) { if (inp.size() > n_batch) {
fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch); __func__, (long long int) inp.size(), (long long int) n_batch);
return 1; return 1;
} }
@ -155,20 +148,20 @@ int main(int argc, char ** argv) {
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) { for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_token_sep(model)) { if (inp.empty() || inp.back() != llama_token_sep(model)) {
fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__); LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
} }
} }
// tokenization stats // tokenization stats
if (params.verbose_prompt) { if (params.verbose_prompt) {
for (int i = 0; i < (int) inputs.size(); i++) { for (int i = 0; i < (int) inputs.size(); i++) {
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
for (int j = 0; j < (int) inputs[i].size(); j++) { for (int j = 0; j < (int) inputs[i].size(); j++) {
fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
} }
fprintf(stderr, "\n\n"); LOG("\n\n");
} }
} }
@ -219,57 +212,57 @@ int main(int argc, char ** argv) {
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
if (params.embd_out.empty()) { if (params.embd_out.empty()) {
fprintf(stdout, "\n"); LOG("\n");
if (pooling_type == LLAMA_POOLING_TYPE_NONE) { if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
for (int j = 0; j < n_embd_count; j++) { for (int j = 0; j < n_embd_count; j++) {
fprintf(stdout, "embedding %d: ", j); LOG("embedding %d: ", j);
for (int i = 0; i < std::min(3, n_embd); i++) { for (int i = 0; i < std::min(3, n_embd); i++) {
if (params.embd_normalize == 0) { if (params.embd_normalize == 0) {
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); LOG("%6.0f ", emb[j * n_embd + i]);
} else { } else {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); LOG("%9.6f ", emb[j * n_embd + i]);
} }
} }
fprintf(stdout, " ... "); LOG(" ... ");
for (int i = n_embd - 3; i < n_embd; i++) { for (int i = n_embd - 3; i < n_embd; i++) {
if (params.embd_normalize == 0) { if (params.embd_normalize == 0) {
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); LOG("%6.0f ", emb[j * n_embd + i]);
} else { } else {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); LOG("%9.6f ", emb[j * n_embd + i]);
} }
} }
fprintf(stdout, "\n"); LOG("\n");
} }
} else { } else {
// print the first part of the embeddings or for a single prompt, the full embedding // print the first part of the embeddings or for a single prompt, the full embedding
for (int j = 0; j < n_prompts; j++) { for (int j = 0; j < n_prompts; j++) {
fprintf(stdout, "embedding %d: ", j); LOG("embedding %d: ", j);
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
if (params.embd_normalize == 0) { if (params.embd_normalize == 0) {
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); LOG("%6.0f ", emb[j * n_embd + i]);
} else { } else {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); LOG("%9.6f ", emb[j * n_embd + i]);
} }
} }
fprintf(stdout, "\n"); LOG("\n");
} }
// print cosine similarity matrix // print cosine similarity matrix
if (n_prompts > 1) { if (n_prompts > 1) {
fprintf(stdout, "\n"); LOG("\n");
printf("cosine similarity matrix:\n\n"); LOG("cosine similarity matrix:\n\n");
for (int i = 0; i < n_prompts; i++) { for (int i = 0; i < n_prompts; i++) {
fprintf(stdout, "%6.6s ", prompts[i].c_str()); LOG("%6.6s ", prompts[i].c_str());
} }
fprintf(stdout, "\n"); LOG("\n");
for (int i = 0; i < n_prompts; i++) { for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) { for (int j = 0; j < n_prompts; j++) {
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f ", sim); LOG("%6.2f ", sim);
} }
fprintf(stdout, "%1.10s", prompts[i].c_str()); LOG("%1.10s", prompts[i].c_str());
fprintf(stdout, "\n"); LOG("\n");
} }
} }
} }
@ -278,43 +271,45 @@ int main(int argc, char ** argv) {
if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") { if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
const bool notArray = params.embd_out != "array"; const bool notArray = params.embd_out != "array";
fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "["); LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
for (int j = 0;;) { // at least one iteration (one prompt) for (int j = 0;;) { // at least one iteration (one prompt)
if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j); if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
fprintf(stdout, "["); LOG("[");
for (int i = 0;;) { // at least one iteration (n_embd > 0) for (int i = 0;;) { // at least one iteration (n_embd > 0)
fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
i++; i++;
if (i < n_embd) fprintf(stdout, ","); else break; if (i < n_embd) LOG(","); else break;
} }
fprintf(stdout, notArray ? "]\n }" : "]"); LOG(notArray ? "]\n }" : "]");
j++; j++;
if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break; if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
} }
fprintf(stdout, notArray ? "\n ]" : "]\n"); LOG(notArray ? "\n ]" : "]\n");
if (params.embd_out == "json+" && n_prompts > 1) { if (params.embd_out == "json+" && n_prompts > 1) {
fprintf(stdout, ",\n \"cosineSimilarity\": [\n"); LOG(",\n \"cosineSimilarity\": [\n");
for (int i = 0;;) { // at least two iteration (n_embd_count > 1) for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
fprintf(stdout, " ["); LOG(" [");
for (int j = 0;;) { // at least two iteration (n_embd_count > 1) for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f", sim); LOG("%6.2f", sim);
j++; j++;
if (j < n_embd_count) fprintf(stdout, ", "); else break; if (j < n_embd_count) LOG(", "); else break;
} }
fprintf(stdout, " ]"); LOG(" ]");
i++; i++;
if (i < n_embd_count) fprintf(stdout, ",\n"); else break; if (i < n_embd_count) LOG(",\n"); else break;
} }
fprintf(stdout, "\n ]"); LOG("\n ]");
} }
if (notArray) fprintf(stdout, "\n}\n"); if (notArray) LOG("\n}\n");
} }
LOG("\n");
llama_perf_context_print(ctx);
// clean up // clean up
llama_print_timings(ctx);
llama_batch_free(batch); llama_batch_free(batch);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);


@ -1,11 +1,11 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include "ggml.h" #include "ggml.h"
#include <cstdio> #include <cstdio>
#include <random>
#include <string> #include <string>
#include <tuple>
#include <vector> #include <vector>
/** /**
@ -31,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
GGML_ASSERT(n > 0); GGML_ASSERT(n > 0);
float sum = 0; float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) { for (int64_t i3 = 0; i3 < ne[3]; i3++) {
printf(" [\n"); LOG(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) { for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) { if (i2 == n && ne[2] > 2*n) {
printf(" ..., \n"); LOG(" ..., \n");
i2 = ne[2] - n; i2 = ne[2] - n;
} }
printf(" [\n"); LOG(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) { for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) { if (i1 == n && ne[1] > 2*n) {
printf(" ..., \n"); LOG(" ..., \n");
i1 = ne[1] - n; i1 = ne[1] - n;
} }
printf(" ["); LOG(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) { for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) { if (i0 == n && ne[0] > 2*n) {
printf("..., "); LOG("..., ");
i0 = ne[0] - n; i0 = ne[0] - n;
} }
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
@ -64,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
} else { } else {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
printf("%12.4f", v); LOG("%12.4f", v);
sum += v; sum += v;
if (i0 < ne[0] - 1) printf(", "); if (i0 < ne[0] - 1) LOG(", ");
} }
printf("],\n"); LOG("],\n");
} }
printf(" ],\n"); LOG(" ],\n");
} }
printf(" ]\n"); LOG(" ]\n");
printf(" sum = %f\n", sum); LOG(" sum = %f\n", sum);
} }
} }
@ -102,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
} }
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
t->name, ggml_type_name(t->type), ggml_op_desc(t), t->name, ggml_type_name(t->type), ggml_op_desc(t),
src0->name, ggml_ne_string(src0).c_str(), src0->name, ggml_ne_string(src0).c_str(),
src1 ? src1_str : "", src1 ? src1_str : "",
ggml_ne_string(t).c_str()); ggml_ne_string(t).c_str());
// copy the data from the GPU memory if needed // copy the data from the GPU memory if needed
@ -127,12 +127,12 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
} }
static bool run(llama_context * ctx, const gpt_params & params) { static bool run(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos); std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return false; return false;
} }
@ -144,14 +144,11 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
print_build_info(); gpt_init();
std::mt19937 rng(params.seed);
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@ -168,14 +165,15 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) { if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__); LOG_ERR("%s : failed to init\n", __func__);
return 1; return 1;
} }
// print system information // print system information
{ {
fprintf(stderr, "\n"); LOG_INF("\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
} }
bool OK = run(ctx, params); bool OK = run(ctx, params);
@ -183,7 +181,8 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
llama_print_timings(ctx); LOG("\n");
llama_perf_context_print(ctx);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);


@ -17,9 +17,9 @@ For example:
```bash ```bash
./bin/llama-export-lora \ ./bin/llama-export-lora \
-m open-llama-3b-v2-q8_0.gguf \ -m open-llama-3b-v2.gguf \
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ -o open-llama-3b-v2-english2tokipona-chat.gguf \
--lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf
``` ```
Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters: Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
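For instance, two adapters could be applied with different strengths in a single invocation; the merged output name, adapter filenames, and scales below are illustrative placeholders, not files referenced elsewhere in this document:

```bash
# lora-task-A.gguf and lora-task-B.gguf are hypothetical adapter files
./bin/llama-export-lora \
    -m open-llama-3b-v2.gguf \
    -o open-llama-3b-v2-merged.gguf \
    --lora-scaled lora-task-A.gguf 0.5 \
    --lora-scaled lora-task-B.gguf 0.5
```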


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
@ -10,6 +11,12 @@
static bool g_verbose = false; static bool g_verbose = false;
struct tensor_transformation {
struct ggml_tensor * in;
struct ggml_tensor * out;
bool is_copy;
};
static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){ static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
int id = gguf_find_key(ctx_gguf, key.c_str()); int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
@ -198,8 +205,7 @@ struct lora_merge_ctx {
} }
// mapping base tensor to out tensor (same shape with base, but different type) // mapping base tensor to out tensor (same shape with base, but different type)
// if out_tensor == nullptr, we only copy it std::vector<tensor_transformation> trans;
std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
for (auto & it : base_model.tensors) { for (auto & it : base_model.tensors) {
bool t_a = true; bool t_a = true;
bool t_b = true; bool t_b = true;
@ -212,14 +218,22 @@ struct lora_merge_ctx {
// only copy // only copy
struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
ggml_set_name(cpy_tensor, base_tensor->name); ggml_set_name(cpy_tensor, base_tensor->name);
base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr)); trans.push_back({
cpy_tensor,
cpy_tensor,
true,
});
gguf_add_tensor(ctx_out, cpy_tensor); gguf_add_tensor(ctx_out, cpy_tensor);
} else if (t_a && t_b) { } else if (t_a && t_b) {
// need merging // need merging
struct ggml_tensor * out_tensor = ggml_new_tensor( struct ggml_tensor * out_tensor = ggml_new_tensor(
ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne); ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
ggml_set_name(out_tensor, base_tensor->name); ggml_set_name(out_tensor, base_tensor->name);
base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor)); trans.push_back({
base_tensor,
out_tensor,
false,
});
gguf_add_tensor(ctx_out, out_tensor); gguf_add_tensor(ctx_out, out_tensor);
} else { } else {
throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b"); throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
@ -234,12 +248,12 @@ struct lora_merge_ctx {
// process base model tensors // process base model tensors
size_t n_merged = 0; size_t n_merged = 0;
for (auto & it : base_to_out_tensors) { for (auto & it : trans) {
if (it.second != nullptr) { if (!it.is_copy) {
merge_tensor(it.first, it.second); merge_tensor(it.in, it.out);
n_merged++; n_merged++;
} else { } else {
copy_tensor(it.first); copy_tensor(it.in);
} }
} }
@ -252,7 +266,7 @@ struct lora_merge_ctx {
} }
printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged); printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size()); printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
} }
void copy_tensor(struct ggml_tensor * base) { void copy_tensor(struct ggml_tensor * base) {
@ -285,6 +299,10 @@ struct lora_merge_ctx {
for (size_t i = 0; i < adapters.size(); ++i) { for (size_t i = 0; i < adapters.size(); ++i) {
auto t_a = adapters[i]->get_tensor(name_lora_a); auto t_a = adapters[i]->get_tensor(name_lora_a);
auto t_b = adapters[i]->get_tensor(name_lora_b); auto t_b = adapters[i]->get_tensor(name_lora_b);
// TODO: add support for quantized lora
if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) {
throw std::runtime_error("quantized LoRA adapters are not supported, please retry with f16 or f32");
}
inp_a[i] = ggml_dup_tensor(ctx, t_a); inp_a[i] = ggml_dup_tensor(ctx, t_a);
inp_b[i] = ggml_dup_tensor(ctx, t_b); inp_b[i] = ggml_dup_tensor(ctx, t_b);
} }
@ -352,7 +370,7 @@ struct lora_merge_ctx {
// write data to output file // write data to output file
{ {
auto result = gf->nodes[gf->n_nodes - 1]; auto * result = ggml_graph_node(gf, -1);
size_t len = ggml_nbytes(result); size_t len = ggml_nbytes(result);
if (read_buf.size() < len) { if (read_buf.size() < len) {
read_buf.resize(len); read_buf.resize(len);
@ -374,9 +392,7 @@ struct lora_merge_ctx {
} }
}; };
static void print_usage(int argc, char ** argv, const gpt_params & params) { static void print_usage(int, char ** argv) {
gpt_params_print_usage(argc, argv, params);
printf("\nexample usage:\n"); printf("\nexample usage:\n");
printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]); printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
printf("\nNOTE: output model is F16\n"); printf("\nNOTE: output model is F16\n");
@ -386,14 +402,13 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
print_usage(argc, argv, params);
return 1; return 1;
} }
g_verbose = (params.verbosity == 1); g_verbose = (params.verbosity > 1);
try { try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads); lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
ctx.run_merge(); ctx.run_merge();
} catch (const std::exception & err) { } catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what()); fprintf(stderr, "%s\n", err.what());


@ -1,9 +1,5 @@
#define LLAMA_API_INTERNAL
#include "grammar-parser.h"
#include "ggml.h"
#include "llama.h"
#include "unicode.h" #include "unicode.h"
#include "llama-grammar.h"
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
@ -12,29 +8,28 @@
#include <string> #include <string>
#include <vector> #include <vector>
static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
auto decoded = decode_utf8(input_str, {}); const auto cpts = unicode_cpts_from_utf8(input_str);
const auto & code_points = decoded.first;
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar); llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
size_t pos = 0; size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { for (const auto & cpt : cpts) {
const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
llama_grammar_accept(rules, prev_stacks, *it, cur_stacks); llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
if (cur_stacks.empty()) { if (stacks_cur.empty()) {
error_pos = pos; error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'"; error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
cur_stacks = prev_stacks; stacks_cur = stacks_prev;
return false; return false;
} }
++pos; ++pos;
} }
for (const auto & stack : cur_stacks) { for (const auto & stack : stacks_cur) {
if (stack.empty()) { if (stack.empty()) {
return true; return true;
} }
@ -85,27 +80,7 @@ int main(int argc, char** argv) {
grammar_str = buffer.str(); grammar_str = buffer.str();
} }
// Parse the GBNF grammar llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
fprintf(stdout, "%s: failed to parse grammar\n", __func__);
return 1;
}
// Ensure that there is a "root" node.
if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
return 1;
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
// Create the LLAMA grammar
auto grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) { if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar"); throw std::runtime_error("Failed to initialize llama_grammar");
} }
@ -122,7 +97,7 @@ int main(int argc, char** argv) {
// Validate the input string against the grammar // Validate the input string against the grammar
size_t error_pos; size_t error_pos;
std::string error_msg; std::string error_msg;
bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg); bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);
if (is_valid) { if (is_valid) {
fprintf(stdout, "Input string is valid according to the grammar.\n"); fprintf(stdout, "Input string is valid according to the grammar.\n");
@ -131,7 +106,7 @@ int main(int argc, char** argv) {
} }
// Clean up // Clean up
llama_grammar_free(grammar); llama_grammar_free_impl(grammar);
return 0; return 0;
} }


@ -0,0 +1,5 @@
set(TARGET llama-gen-docs)
add_executable(${TARGET} gen-docs.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@ -0,0 +1,83 @@
#include "arg.h"
#include "common.h"
#include <fstream>
#include <string>
// Export usage message (-h) to markdown format
static void write_table_header(std::ofstream & file) {
file << "| Argument | Explanation |\n";
file << "| -------- | ----------- |\n";
}
static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
file << "| `";
// args
for (const auto & arg : opt.args) {
if (arg == opt.args.front()) {
file << arg;
if (opt.args.size() > 1) file << ", ";
} else {
file << arg << (arg != opt.args.back() ? ", " : "");
}
}
// value hint
if (opt.value_hint) {
std::string md_value_hint(opt.value_hint);
string_replace_all(md_value_hint, "|", "\\|");
file << " " << md_value_hint;
}
if (opt.value_hint_2) {
std::string md_value_hint_2(opt.value_hint_2);
string_replace_all(md_value_hint_2, "|", "\\|");
file << " " << md_value_hint_2;
}
// help text
std::string md_help(opt.help);
string_replace_all(md_help, "\n", "<br/>");
string_replace_all(md_help, "|", "\\|");
file << "` | " << md_help << " |\n";
}
static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
write_table_header(file);
for (const auto & opt : opts) {
write_table_entry(file, *opt);
}
}
static void export_md(std::string fname, llama_example ex) {
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
gpt_params params;
auto ctx_arg = gpt_params_parser_init(params, ex);
std::vector<llama_arg *> common_options;
std::vector<llama_arg *> sparam_options;
std::vector<llama_arg *> specific_options;
for (auto & opt : ctx_arg.options) {
// in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
if (opt.is_sparam) {
sparam_options.push_back(&opt);
} else if (opt.in_example(ctx_arg.ex)) {
specific_options.push_back(&opt);
} else {
common_options.push_back(&opt);
}
}
file << "**Common params**\n\n";
write_table(file, common_options);
file << "\n\n**Sampling params**\n\n";
write_table(file, sparam_options);
file << "\n\n**Example-specific params**\n\n";
write_table(file, specific_options);
}
int main(int, char **) {
export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
return 0;
}


@ -152,7 +152,7 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
throw std::invalid_argument("error: invalid parameter for argument: " + arg); throw std::invalid_argument("error: invalid parameter for argument: " + arg);
} }
if (argc - arg_idx < 2) { if (argc - arg_idx != 2) {
throw std::invalid_argument("error: bad arguments"); throw std::invalid_argument("error: bad arguments");
} }
@ -389,10 +389,17 @@ static void gguf_merge(const split_params & split_params) {
int n_split = 1; int n_split = 1;
int total_tensors = 0; int total_tensors = 0;
auto * ctx_out = gguf_init_empty(); // avoid overwriting existing output file
if (std::ifstream(split_params.output.c_str())) {
fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
exit(EXIT_FAILURE);
}
std::ofstream fout(split_params.output.c_str(), std::ios::binary); std::ofstream fout(split_params.output.c_str(), std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors fout.exceptions(std::ofstream::failbit); // fail fast on write errors
auto * ctx_out = gguf_init_empty();
std::vector<uint8_t> read_data; std::vector<uint8_t> read_data;
std::vector<ggml_context *> ctx_metas; std::vector<ggml_context *> ctx_metas;
std::vector<gguf_context *> ctx_ggufs; std::vector<gguf_context *> ctx_ggufs;


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
@ -9,7 +10,7 @@
static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) { static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
std::vector<std::vector<float>> result; std::vector<std::vector<float>> result;
const llama_model * mdl = llama_get_model(ctx); const llama_model * model = llama_get_model(ctx);
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
@ -18,16 +19,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
const std::string input_string = instruction + sentences[i]; const std::string input_string = instruction + sentences[i];
std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false); std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false);
const int32_t n_toks = inputs.size(); const int32_t n_toks = inputs.size();
// GritLM seems to have EOS = "" // GritLM seems to have EOS = ""
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18 // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
// inputs.push_back(llama_token_eos(mdl)); // inputs.push_back(llama_token_eos(model));
// we want to ignore instruction tokens for mean pooling // we want to ignore instruction tokens for mean pooling
const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size(); const int32_t n_inst = llama_tokenize(model, instruction, true, false).size();
#ifdef GRIT_DEBUG #ifdef GRIT_DEBUG
// debug tokens - should be matching as referenced in the GritLM sample // debug tokens - should be matching as referenced in the GritLM sample
@ -51,7 +52,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
llama_decode(ctx, batch); llama_decode(ctx, batch);
// get embedding dimensions // get embedding dimensions
uint64_t n_embd = llama_n_embd(mdl); uint64_t n_embd = llama_n_embd(model);
// allocate embedding output // allocate embedding output
std::vector<float> emb_unorm(n_embd, 0.0f); std::vector<float> emb_unorm(n_embd, 0.0f);
@ -92,11 +93,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
return result; return result;
} }
static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) { static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) {
std::string result; std::string result;
const llama_model * mdl = llama_get_model(ctx); const llama_model * model = llama_get_model(ctx);
llama_token eos_token = llama_token_eos(mdl); llama_token eos_token = llama_token_eos(model);
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, false); llama_set_embeddings(ctx, false);
@ -104,28 +105,24 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true); std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true);
int32_t i_current_token = 0; int32_t i_current_token = 0;
while (true) { while (true) {
llama_batch_clear(bat); llama_batch_clear(bat);
auto n_inputs = (int32_t)inputs.size(); {
for (int32_t i = 0; i < n_inputs; i++) { const int32_t n_inputs = inputs.size();
llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
for (int32_t i = 0; i < n_inputs; i++) {
llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
}
} }
inputs.clear(); inputs.clear();
llama_decode(ctx, bat); llama_decode(ctx, bat);
auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl)); llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
auto n_candidates = (int32_t)candidates.size();
for (int32_t token = 0; token < n_candidates; token++) {
candidates[token] = llama_token_data{ token, logits[token], 0.0f };
}
auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
if (token == eos_token) { if (token == eos_token) {
break; break;
} }
@ -157,20 +154,29 @@ static std::string gritlm_instruction(const std::string & instruction) {
int main(int argc, char * argv[]) { int main(int argc, char * argv[]) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
gpt_init();
llama_model_params mparams = llama_model_params_from_gpt_params(params); llama_model_params mparams = llama_model_params_from_gpt_params(params);
llama_context_params cparams = llama_context_params_from_gpt_params(params); llama_context_params cparams = llama_context_params_from_gpt_params(params);
llama_backend_init(); llama_backend_init();
llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams); llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
// create generation context // create generation context
llama_context * ctx = llama_new_context_with_model(mdl, cparams); llama_context * ctx = llama_new_context_with_model(model, cparams);
auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false;
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
// ### Embedding/Representation ### // ### Embedding/Representation ###
// samples taken from: https://github.com/ContextualAI/gritlm#basic // samples taken from: https://github.com/ContextualAI/gritlm#basic
@ -191,7 +197,7 @@ int main(int argc, char * argv[]) {
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction("")); const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction)); const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
const int n_embd = llama_n_embd(mdl); const int n_embd = llama_n_embd(model);
const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
@ -208,11 +214,12 @@ int main(int argc, char * argv[]) {
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
{ {
const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n"; const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
std::string response = generate(ctx, prompt, true); std::string response = generate(ctx, smpl, prompt, true);
} }
llama_sampler_free(smpl);
llama_free(ctx); llama_free(ctx);
llama_free_model(mdl); llama_free_model(model);
llama_backend_free(); llama_backend_free();
return 0; return 0;


@ -1,4 +1,6 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <cmath> #include <cmath>
@ -17,15 +19,13 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
static void print_usage(int argc, char ** argv, const gpt_params & params) { static void print_usage(int, char ** argv) {
gpt_params_print_usage(argc, argv, params); LOG("\nexample usage:\n");
LOG("\n %s \\\n"
LOG_TEE("\nexample usage:\n"); " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
LOG_TEE("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]); " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
LOG_TEE("\n"); LOG("\n");
} }
struct Stats { struct Stats {
@ -126,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
e.counts.resize(src1->ne[0]*n_as, 0); e.counts.resize(src1->ne[0]*n_as, 0);
} }
else if (e.values.size() != (size_t)src1->ne[0]*n_as) { else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ABORT("fatal error"); exit(1); //GGML_ABORT("fatal error");
} }
if (m_params.verbosity > 1) { LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
}
// loop over all possible experts, regardless if they are used or not in the batch // loop over all possible experts, regardless if they are used or not in the batch
for (int ex = 0; ex < n_as; ++ex) { for (int ex = 0; ex < n_as; ++ex) {
size_t e_start = ex*src1->ne[0]; size_t e_start = ex*src1->ne[0];
@ -152,7 +150,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
e.values[e_start + j] += x[j]*x[j]; e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++; e.counts[e_start + j]++;
if (!std::isfinite(e.values[e_start + j])) { if (!std::isfinite(e.values[e_start + j])) {
fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str()); LOG("\n");
LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
exit(1); exit(1);
} }
} }
@ -175,20 +174,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
e.counts.resize(src1->ne[0], 0); e.counts.resize(src1->ne[0], 0);
} }
else if (e.values.size() != (size_t)src1->ne[0]) { else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ABORT("fatal error"); exit(1); //GGML_ABORT("fatal error");
} }
++e.ncall; ++e.ncall;
if (m_params.verbosity > 1) { LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
}
for (int row = 0; row < (int)src1->ne[1]; ++row) { for (int row = 0; row < (int)src1->ne[1]; ++row) {
const float * x = data + row * src1->ne[0]; const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) { for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j]; e.values[j] += x[j]*x[j];
e.counts[j]++; e.counts[j]++;
if (!std::isfinite(e.values[j])) { if (!std::isfinite(e.values[j])) {
fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str()); LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
exit(1); exit(1);
} }
} }
@ -240,17 +237,17 @@ void IMatrixCollector::save_imatrix(int ncall) const {
} }
if (n_zeros != 0 && is_first) { if (n_zeros != 0 && is_first) {
fprintf(stderr, "\n"); LOG_INF("\n");
is_first = false; is_first = false;
} }
if (n_zeros == n_all) { if (n_zeros == n_all) {
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
continue; continue;
} }
if (n_zeros > 0) { if (n_zeros > 0) {
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
continue; continue;
} }
@ -259,7 +256,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
} }
if (to_store.size() < m_stats.size()) { if (to_store.size() < m_stats.size()) {
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
} }
std::ofstream out(fname, std::ios::binary); std::ofstream out(fname, std::ios::binary);
@ -291,21 +288,20 @@ void IMatrixCollector::save_imatrix(int ncall) const {
out.write(m_params.prompt_file.c_str(), len); out.write(m_params.prompt_file.c_str(), len);
} }
if (m_params.verbosity > 0) { LOGV(1, "\n");
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}
} }
bool IMatrixCollector::load_imatrix(const char * fname) { bool IMatrixCollector::load_imatrix(const char * fname) {
std::ifstream in(fname, std::ios::binary); std::ifstream in(fname, std::ios::binary);
if (!in) { if (!in) {
printf("%s: failed to open %s\n",__func__, fname); LOG_ERR("%s: failed to open %s\n",__func__, fname);
return false; return false;
} }
int n_entries; int n_entries;
in.read((char*)&n_entries, sizeof(n_entries)); in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) { if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, fname); LOG_ERR("%s: no data in file %s\n", __func__, fname);
return false; return false;
} }
for (int i = 0; i < n_entries; ++i) { for (int i = 0; i < n_entries; ++i) {
@ -313,7 +309,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
std::vector<char> name_as_vec(len+1); std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len); in.read((char *)name_as_vec.data(), len);
if (in.fail()) { if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false; return false;
} }
name_as_vec[len] = 0; name_as_vec[len] = 0;
@ -324,7 +320,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
int nval; int nval;
in.read((char *)&nval, sizeof(nval)); in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) { if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n",__func__,i); LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
m_stats = {}; m_stats = {};
return false; return false;
} }
@ -337,7 +333,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
std::vector<float> tmp(nval); std::vector<float> tmp(nval);
in.read((char*)tmp.data(), nval*sizeof(float)); in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) { if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i); LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
m_stats = {}; m_stats = {};
return false; return false;
} }
@ -433,31 +429,30 @@ static void process_logits(
} }
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now(); auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__); LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true); std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now(); auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count()); LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (params.i_chunk > 0) { if (params.i_chunk > 0) {
if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) { if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
return false; return false;
} }
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx); tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
} }
if (int(tokens.size()) < 2*n_ctx) { if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
n_ctx); LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return false; return false;
} }
@ -479,7 +474,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
double nll = 0.0; double nll = 0.0;
double nll2 = 0.0; double nll2 = 0.0;
fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1); std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@ -515,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
// TODO: use batch.logits to save computations instead of relying on logits_all == true // TODO: use batch.logits to save computations instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return false; return false;
} }
@ -532,29 +527,29 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
if (i == 0) { if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count(); const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk); int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) { if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60)); LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60); total_seconds = total_seconds % (60*60);
} }
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); LOG("%.2f minutes\n", total_seconds / 60.0);
} }
if (params.compute_ppl) { if (params.compute_ppl) {
const int first = n_ctx/2; const int first = n_ctx/2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1; count += n_ctx - first - 1;
printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout); fflush(stdout);
logits.clear(); logits.clear();
} }
} }
printf("\n"); LOG("\n");
if (params.compute_ppl) { if (params.compute_ppl) {
nll2 /= count; nll2 /= count;
@ -563,9 +558,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
nll2 -= nll * nll; nll2 -= nll * nll;
if (nll2 > 0) { if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1)); nll2 = sqrt(nll2/(count-1));
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else { } else {
printf("Unexpected negative standard deviation of log(prob)\n"); LOG("Unexpected negative standard deviation of log(prob)\n");
} }
} }
@ -577,27 +572,28 @@ int main(int argc, char ** argv) {
params.n_ctx = 512; params.n_ctx = 512;
params.logits_all = true; params.logits_all = true;
params.verbosity = 1; params.escape = false;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
print_usage(argc, argv, params);
return 1; return 1;
} }
gpt_init();
params.n_batch = std::min(params.n_batch, params.n_ctx); params.n_batch = std::min(params.n_batch, params.n_ctx);
g_collector.set_params(params); g_collector.set_params(params);
for (const auto & in_file : params.in_files) { for (const auto & in_file : params.in_files) {
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
if (!g_collector.load_imatrix(in_file.c_str())) { if (!g_collector.load_imatrix(in_file.c_str())) {
fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str()); LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
return 1; return 1;
} }
} }
if (params.in_files.size() > 1) { if (params.in_files.size() > 1) {
printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
g_collector.save_imatrix(); g_collector.save_imatrix();
} }
@ -616,20 +612,20 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) { if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__); LOG_ERR("%s : failed to init\n", __func__);
return 1; return 1;
} }
const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) { if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx); __func__, n_ctx_train, params.n_ctx);
} }
// print system information // print system information
{ {
fprintf(stderr, "\n"); LOG_INF("\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
} }
if (!compute_imatrix(ctx, params)) { if (!compute_imatrix(ctx, params)) {
@ -638,7 +634,8 @@ int main(int argc, char ** argv) {
g_collector.save_imatrix(); g_collector.save_imatrix();
llama_print_timings(ctx); LOG("\n");
llama_perf_context_print(ctx);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);

View file

@ -1,8 +1,9 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "console.h" #include "console.h"
#include "sampling.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include "grammar-parser.h"
#include <cassert> #include <cassert>
#include <cinttypes> #include <cinttypes>
@ -34,6 +35,7 @@
static llama_context ** g_ctx; static llama_context ** g_ctx;
static llama_model ** g_model; static llama_model ** g_model;
static gpt_sampler ** g_smpl;
static gpt_params * g_params; static gpt_params * g_params;
static std::vector<llama_token> * g_input_tokens; static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss; static std::ostringstream * g_output_ss;
@ -54,7 +56,7 @@ static void write_logfile(
const bool success = fs_create_directory_with_parents(params.logdir); const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) { if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str()); __func__, params.logdir.c_str());
return; return;
} }
@ -63,7 +65,7 @@ static void write_logfile(
FILE * logfile = fopen(logfile_path.c_str(), "w"); FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) { if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return; return;
} }
@ -81,7 +83,7 @@ static void write_logfile(
yaml_dump_string_multiline(logfile, "output", output.c_str()); yaml_dump_string_multiline(logfile, "output", output.c_str());
yaml_dump_vector_int(logfile, "output_tokens", output_tokens); yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx); llama_perf_dump_yaml(logfile, ctx);
fclose(logfile); fclose(logfile);
} }
@ -92,9 +94,14 @@ static void sigint_handler(int signo) {
is_interacting = true; is_interacting = true;
} else { } else {
console::cleanup(); console::cleanup();
printf("\n"); LOG("\n");
llama_print_timings(*g_ctx); gpt_perf_print(*g_ctx, *g_smpl);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
// make sure all logs are flushed
LOG("Interrupted by user\n");
gpt_log_pause(gpt_log_main());
_exit(130); _exit(130);
} }
} }
@ -103,109 +110,95 @@ static void sigint_handler(int signo) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
llama_sampling_params & sparams = params.sparams;
g_params = &params; g_params = &params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
#ifndef LOG_DISABLE_LOGS gpt_init();
log_set_target(log_filename_generator("infill", "log"));
LOG_TEE("Log start\n"); auto & sparams = params.sparams;
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
console::init(params.simple_io, params.use_color); console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); }); atexit([]() { console::cleanup(); });
if (params.logits_all) { if (params.logits_all) {
printf("\n************\n"); LOG_ERR("\n************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
printf("************\n\n"); LOG_ERR("************\n\n");
return 0; return 0;
} }
if (params.embedding) { if (params.embedding) {
printf("\n************\n"); LOG_ERR("\n************\n");
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
printf("************\n\n"); LOG_ERR("************\n\n");
return 0; return 0;
} }
if (params.n_ctx != 0 && params.n_ctx < 8) { if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8; params.n_ctx = 8;
} }
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) { if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
printf("\n************\n"); LOG_ERR("\n************\n");
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
printf("************\n\n"); LOG_ERR("************\n\n");
return 0; return 0;
} }
if (params.rope_freq_base != 0.0) { if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
} }
if (params.rope_freq_scale != 0.0) { if (params.rope_freq_scale != 0.0) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
} }
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); LOG_INF("%s: llama backend init\n", __func__);
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
LOG("%s: llama backend init\n", __func__);
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
llama_model * model; llama_model * model = nullptr;
llama_context * ctx; llama_context * ctx = nullptr;
gpt_sampler * smpl = nullptr;
g_model = &model; g_model = &model;
g_ctx = &ctx; g_ctx = &ctx;
g_smpl = &smpl;
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__); LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params); llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model; model = llama_init.model;
ctx = llama_init.context; ctx = llama_init.context;
if (model == NULL) { if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__); LOG_ERR("%s: unable to load model\n", __func__);
return 1; return 1;
} }
const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx); LOG_DBG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) { if (n_ctx > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
__func__, n_ctx_train, n_ctx);
} }
// print system information // print system information
{ {
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
} }
const bool add_bos = llama_should_add_bos_token(model); const bool add_bos = llama_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1); GGML_ASSERT(!llama_add_eos_token(model));
LOG("add_bos: %d\n", add_bos);
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end; std::vector<llama_token> embd_end;
@ -230,18 +223,19 @@ int main(int argc, char ** argv) {
embd_inp.push_back(middle_token); embd_inp.push_back(middle_token);
} }
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix)); LOG_DBG("add_bos: %d\n", add_bos);
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix)); LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
// Should not run without any tokens // Should not run without any tokens
if (embd_inp.empty()) { if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(model)); embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} }
if ((int) embd_inp.size() > n_ctx - 4) { if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1; return 1;
} }
@ -250,9 +244,8 @@ int main(int argc, char ** argv) {
params.n_keep = (int)embd_inp.size(); params.n_keep = (int)embd_inp.size();
} }
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
// enable interactive mode if interactive start is specified // enable interactive mode if interactive start is specified
if (params.interactive_first) { if (params.interactive_first) {
@ -260,21 +253,21 @@ int main(int argc, char ** argv) {
} }
if (params.verbose_prompt) { if (params.verbose_prompt) {
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) { for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
} }
if (params.n_keep > 0) { if (params.n_keep > 0) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__); LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) { for (int i = 0; i < params.n_keep; i++) {
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
} }
LOG_TEE("'\n"); LOG_CNT("'\n");
} }
LOG_TEE("\n"); LOG_INF("\n");
} }
if (params.interactive) { if (params.interactive) {
@ -291,30 +284,30 @@ int main(int argc, char ** argv) {
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true); SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif #endif
LOG_TEE("%s: interactive mode on.\n", __func__); LOG_INF("%s: interactive mode on.\n", __func__);
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
LOG_TEE("Input prefix with BOS\n"); LOG_INF("Input prefix with BOS\n");
} }
if (!params.input_prefix.empty()) { if (!params.input_prefix.empty()) {
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
} }
if (!params.input_suffix.empty()) { if (!params.input_suffix.empty()) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
} }
} }
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); smpl = gpt_sampler_init(model, sparams);
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
LOG_TEE("\n##### Infill mode #####\n\n"); LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
if (params.infill) { LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
printf("\n************\n"); LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
printf("no need to specify '--infill', always running infill\n");
printf("************\n\n"); LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
}
LOG_INF("\n");
LOG_INF("\n##### Infill mode #####\n\n");
if (params.interactive) { if (params.interactive) {
const char *control_message; const char *control_message;
if (params.multiline_input) { if (params.multiline_input) {
@ -325,11 +318,11 @@ int main(int argc, char ** argv) {
" - To return control without starting a new line, end your input with '/'.\n" " - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n"; " - If you want to submit another line, end your input with '\\'.\n";
} }
LOG_TEE("== Running in interactive mode. ==\n"); LOG_INF("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); LOG_INF( " - Press Ctrl+C to interject at any time.\n");
#endif #endif
LOG_TEE( "%s\n", control_message); LOG_INF( "%s\n", control_message);
is_interacting = params.interactive_first; is_interacting = params.interactive_first;
} }
@ -349,8 +342,6 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd; std::vector<llama_token> embd;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
while (n_remain != 0 || params.interactive) { while (n_remain != 0 || params.interactive) {
// predict // predict
if (!embd.empty()) { if (!embd.empty()) {
@ -364,9 +355,8 @@ int main(int argc, char ** argv) {
embd.resize(max_embd_size); embd.resize(max_embd_size);
console::set_display(console::error); console::set_display(console::error);
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset); console::set_display(console::reset);
fflush(stdout);
} }
// infinite text generation via context swapping // infinite text generation via context swapping
@ -375,14 +365,14 @@ int main(int argc, char ** argv) {
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() > n_ctx) { if (n_past + (int) embd.size() > n_ctx) {
if (params.n_predict == -2) { if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break; break;
} }
const int n_left = n_past - params.n_keep - 1; const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2; const int n_discard = n_left/2;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard); n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
@ -390,9 +380,9 @@ int main(int argc, char ** argv) {
n_past -= n_discard; n_past -= n_discard;
LOG("after swap: n_past = %d\n", n_past); LOG_DBG("after swap: n_past = %d\n", n_past);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
} }
@ -404,16 +394,16 @@ int main(int argc, char ** argv) {
n_eval = params.n_batch; n_eval = params.n_batch;
} }
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_TEE("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return 1; return 1;
} }
n_past += n_eval; n_past += n_eval;
LOG("n_past = %d\n", n_past); LOG_DBG("n_past = %d\n", n_past);
} }
} }
@ -421,11 +411,11 @@ int main(int argc, char ** argv) {
embd.clear(); embd.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr); const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
llama_sampling_accept(ctx_sampling, ctx, id, true); gpt_sampler_accept(smpl, id, true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
embd.push_back(id); embd.push_back(id);
@ -435,16 +425,16 @@ int main(int argc, char ** argv) {
// decrement remaining sampling budget // decrement remaining sampling budget
--n_remain; --n_remain;
LOG("n_remain: %d\n", n_remain); LOG_DBG("n_remain: %d\n", n_remain);
} else { } else {
// some user input remains from prompt or interaction, forward it to processing // some user input remains from prompt or interaction, forward it to processing
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) { while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]); embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later // push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules // for the prompt, we don't apply grammar rules
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); gpt_sampler_accept(smpl, embd_inp[n_consumed], false);
++n_consumed; ++n_consumed;
if ((int) embd.size() >= params.n_batch) { if ((int) embd.size() >= params.n_batch) {
@ -457,7 +447,7 @@ int main(int argc, char ** argv) {
if (input_echo) { if (input_echo) {
for (auto id : embd) { for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str()); LOG("%s", token_str.c_str());
if (embd.size() > 1) { if (embd.size() > 1) {
input_tokens.push_back(id); input_tokens.push_back(id);
@ -466,7 +456,6 @@ int main(int argc, char ** argv) {
output_ss << token_str; output_ss << token_str;
} }
} }
fflush(stdout);
} }
// reset color to default if we there is no pending user input // reset color to default if we there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) { if (input_echo && (int) embd_inp.size() == n_consumed) {
@ -476,13 +465,12 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs; // if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) { if ((int) embd_inp.size() <= n_consumed) {
// deal with eot token in infill mode // deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) { if (is_interacting && !params.interactive_first) {
// print an eot token // print an eot token
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
} }
fflush(stdout); LOG("\n");
printf("\n");
console::set_display(console::user_input); console::set_display(console::user_input);
std::string buffer; std::string buffer;
std::string line; std::string line;
@ -538,35 +526,33 @@ int main(int argc, char ** argv) {
n_remain = params.n_predict; n_remain = params.n_predict;
n_past = 0; n_past = 0;
n_consumed = 0; n_consumed = 0;
// LOG_TEE("took new input\n");
is_interacting = false; is_interacting = false;
} }
// deal with end of generation tokens in interactive mode // deal with end of generation tokens in interactive mode
else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) { else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
LOG("found EOS token\n"); LOG_DBG("found EOS token\n");
if (params.interactive) { if (params.interactive) {
is_interacting = true; is_interacting = true;
printf("\n"); LOG("\n");
console::set_display(console::user_input); console::set_display(console::user_input);
fflush(stdout);
} }
} }
if (n_past > 0 && is_interacting && !params.interactive) { if (n_past > 0 && is_interacting && !params.interactive) {
LOG("waiting for user input\n"); LOG_DBG("waiting for user input\n");
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n"); LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model)); embd_inp.push_back(llama_token_bos(model));
} }
std::string buffer; std::string buffer;
if (!params.input_prefix.empty()) { if (!params.input_prefix.empty()) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
buffer += params.input_prefix; buffer += params.input_prefix;
printf("%s", buffer.c_str()); LOG("%s", buffer.c_str());
} }
std::string line; std::string line;
@ -584,17 +570,17 @@ int main(int argc, char ** argv) {
if (buffer.length() > 1) { if (buffer.length() > 1) {
// append input suffix if any // append input suffix if any
if (!params.input_suffix.empty()) { if (!params.input_suffix.empty()) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
buffer += params.input_suffix; buffer += params.input_suffix;
printf("%s", params.input_suffix.c_str()); LOG("%s", params.input_suffix.c_str());
} }
LOG("buffer: '%s'\n", buffer.c_str()); LOG_DBG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size(); const size_t original_size = embd_inp.size();
const auto line_inp = ::llama_tokenize(ctx, buffer, false); const auto line_inp = ::llama_tokenize(ctx, buffer, false);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
@ -605,9 +591,9 @@ int main(int argc, char ** argv) {
} }
n_remain -= line_inp.size(); n_remain -= line_inp.size();
LOG("n_remain: %d\n", n_remain); LOG_DBG("n_remain: %d\n", n_remain);
} else { } else {
LOG("empty line, passing control back\n"); LOG_DBG("empty line, passing control back\n");
} }
input_echo = false; // do not echo this again input_echo = false; // do not echo this again
@ -615,7 +601,7 @@ int main(int argc, char ** argv) {
if (n_past > 0) { if (n_past > 0) {
if (is_interacting) { if (is_interacting) {
llama_sampling_reset(ctx_sampling); gpt_sampler_reset(smpl);
} }
is_interacting = false; is_interacting = false;
} }
@ -634,22 +620,18 @@ int main(int argc, char ** argv) {
} }
} }
if (!params.interactive && n_remain <= 0) { if (!params.interactive && n_remain <= 0) {
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
fflush(stdout);
} }
llama_print_timings(ctx); LOG("\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
llama_sampling_free(ctx_sampling); gpt_sampler_free(smpl);
llama_backend_free(); llama_backend_free();
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS
return 0; return 0;
} }

View file

@ -14,7 +14,8 @@ Performance testing tool for llama.cpp.
1. [Markdown](#markdown) 1. [Markdown](#markdown)
2. [CSV](#csv) 2. [CSV](#csv)
3. [JSON](#json) 3. [JSON](#json)
4. [SQL](#sql) 4. [JSONL](#jsonl)
5. [SQL](#sql)
## Syntax ## Syntax
@ -23,27 +24,34 @@ usage: ./llama-bench [options]
options: options:
-h, --help -h, --help
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf) -m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512) -p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128) -n, --n-gen <n> (default: 128)
-pg <pp,tg> (default: 512,128) -pg <pp,tg> (default: )
-b, --batch-size <n> (default: 2048) -b, --batch-size <n> (default: 2048)
-ub, --ubatch-size <n> (default: 512) -ub, --ubatch-size <n> (default: 512)
-ctk, --cache-type-k <t> (default: f16) -ctk, --cache-type-k <t> (default: f16)
-ctv, --cache-type-v <t> (default: f16) -ctv, --cache-type-v <t> (default: f16)
-t, --threads <n> (default: 16) -t, --threads <n> (default: 8)
-ngl, --n-gpu-layers <n> (default: 99) -C, --cpu-mask <hex,hex> (default: 0x0)
-sm, --split-mode <none|layer|row> (default: layer) --cpu-strict <0|1> (default: 0)
-mg, --main-gpu <i> (default: 0) --poll <0...100> (default: 50)
-nkvo, --no-kv-offload <0|1> (default: 0) -ngl, --n-gpu-layers <n> (default: 99)
-fa, --flash-attn <0|1> (default: 0) -rpc, --rpc <rpc_servers> (default: )
-mmp, --mmap <0|1> (default: 1) -sm, --split-mode <none|layer|row> (default: layer)
--numa <distribute|isolate|numactl> (default: disabled) -mg, --main-gpu <i> (default: 0)
-embd, --embeddings <0|1> (default: 0) -nkvo, --no-kv-offload <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0) -fa, --flash-attn <0|1> (default: 0)
-r, --repetitions <n> (default: 5) -mmp, --mmap <0|1> (default: 1)
-o, --output <csv|json|md|sql> (default: md) --numa <distribute|isolate|numactl> (default: disabled)
-v, --verbose (default: 0) -embd, --embeddings <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5)
--prio <0|1|2|3> (default: 0)
--delay <0...N> (seconds) (default: 0)
-o, --output <csv|json|jsonl|md|sql> (default: md)
-oe, --output-err <csv|json|jsonl|md|sql> (default: none)
-v, --verbose (default: 0)
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
``` ```
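As an illustration (not part of this diff; the model path is simply the default placeholder shown in the help text above), a single run can sweep several values of a parameter by passing a comma-separated list:

```sh
# benchmark 512-token prompt processing and 128-token generation at both 8 and 16 threads
$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -t 8,16 -p 512 -n 128
```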
@ -238,6 +246,19 @@ $ ./llama-bench -o json
] ]
``` ```
### JSONL
```sh
$ ./llama-bench -o jsonl
```
```json lines
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
```
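Each line is a self-contained JSON object, so the output streams cleanly into line-oriented tools. For example (an assumed post-processing step, not text from this diff), `jq` can pull the average tokens per second out of every result line:

```sh
# extract avg_ts from each JSONL result (assumes jq is installed)
$ ./llama-bench -o jsonl | jq '.avg_ts'
```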
### SQL ### SQL
SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database. SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
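For example (a sketch of typical usage rather than text taken from this diff; the database filename is arbitrary):

```sh
# append the generated SQL to a SQLite database
$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
```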

View file

@ -16,6 +16,7 @@
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <thread>
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "llama.h"
@ -123,6 +124,9 @@ static std::string get_cpu_info() {
(LPBYTE)cpu_brand, (LPBYTE)cpu_brand,
&cpu_brand_size) == ERROR_SUCCESS) { &cpu_brand_size) == ERROR_SUCCESS) {
id.assign(cpu_brand, cpu_brand_size); id.assign(cpu_brand, cpu_brand_size);
if (id.find('\0') != std::string::npos) {
id.resize(id.find('\0'));
}
} }
RegCloseKey(hKey); RegCloseKey(hKey);
#endif #endif
@ -170,13 +174,14 @@ static std::string get_gpu_info() {
} }
// command line params // command line params
enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL}; enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
static const char * output_format_str(output_formats format) { static const char * output_format_str(output_formats format) {
switch (format) { switch (format) {
case NONE: return "none"; case NONE: return "none";
case CSV: return "csv"; case CSV: return "csv";
case JSON: return "json"; case JSON: return "json";
case JSONL: return "jsonl";
case MARKDOWN: return "md"; case MARKDOWN: return "md";
case SQL: return "sql"; case SQL: return "sql";
default: GGML_ABORT("invalid output format"); default: GGML_ABORT("invalid output format");
@ -190,6 +195,8 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
format = CSV; format = CSV;
} else if (s == "json") { } else if (s == "json") {
format = JSON; format = JSON;
} else if (s == "jsonl") {
format = JSONL;
} else if (s == "md") { } else if (s == "md") {
format = MARKDOWN; format = MARKDOWN;
} else if (s == "sql") { } else if (s == "sql") {
@ -225,6 +232,9 @@ struct cmd_params {
std::vector<ggml_type> type_k; std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v; std::vector<ggml_type> type_v;
std::vector<int> n_threads; std::vector<int> n_threads;
std::vector<std::string> cpu_mask;
std::vector<bool> cpu_strict;
std::vector<int> poll;
std::vector<int> n_gpu_layers; std::vector<int> n_gpu_layers;
std::vector<std::string> rpc_servers; std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode; std::vector<llama_split_mode> split_mode;
@ -236,7 +246,10 @@ struct cmd_params {
std::vector<bool> embeddings; std::vector<bool> embeddings;
ggml_numa_strategy numa; ggml_numa_strategy numa;
int reps; int reps;
ggml_sched_priority prio;
int delay;
bool verbose; bool verbose;
bool progress;
output_formats output_format; output_formats output_format;
output_formats output_format_stderr; output_formats output_format_stderr;
}; };
@ -251,6 +264,9 @@ static const cmd_params cmd_params_defaults = {
/* type_k */ {GGML_TYPE_F16}, /* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()}, /* n_threads */ {cpu_get_num_math()},
/* cpu_mask */ {"0x0"},
/* cpu_strict */ {false},
/* poll */ {50},
/* n_gpu_layers */ {99}, /* n_gpu_layers */ {99},
/* rpc_servers */ {""}, /* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
@ -262,7 +278,10 @@ static const cmd_params cmd_params_defaults = {
/* embeddings */ {false}, /* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED, /* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5, /* reps */ 5,
/* prio */ GGML_SCHED_PRIO_NORMAL,
/* delay */ 0,
/* verbose */ false, /* verbose */ false,
/* progress */ false,
/* output_format */ MARKDOWN, /* output_format */ MARKDOWN,
/* output_format_stderr */ NONE, /* output_format_stderr */ NONE,
}; };
@ -272,29 +291,37 @@ static void print_usage(int /* argc */, char ** argv) {
printf("\n"); printf("\n");
printf("options:\n"); printf("options:\n");
printf(" -h, --help\n"); printf(" -h, --help\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); #ifdef GGML_USE_RPC
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); #endif
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n"); printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n"); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps); printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
printf("\n"); printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
} }
@ -338,6 +365,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.output_format_stderr = cmd_params_defaults.output_format_stderr; params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps; params.reps = cmd_params_defaults.reps;
params.numa = cmd_params_defaults.numa; params.numa = cmd_params_defaults.numa;
params.prio = cmd_params_defaults.prio;
params.delay = cmd_params_defaults.delay;
params.progress = cmd_params_defaults.progress;
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
arg = argv[i]; arg = argv[i];
@ -409,6 +439,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
types.push_back(gt); types.push_back(gt);
} }
if (invalid_param) {
break;
}
params.type_k.insert(params.type_k.end(), types.begin(), types.end()); params.type_k.insert(params.type_k.end(), types.begin(), types.end());
} else if (arg == "-ctv" || arg == "--cache-type-v") { } else if (arg == "-ctv" || arg == "--cache-type-v") {
if (++i >= argc) { if (++i >= argc) {
@ -425,6 +458,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
types.push_back(gt); types.push_back(gt);
} }
if (invalid_param) {
break;
}
params.type_v.insert(params.type_v.end(), types.begin(), types.end()); params.type_v.insert(params.type_v.end(), types.begin(), types.end());
} else if (arg == "-t" || arg == "--threads") { } else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) { if (++i >= argc) {
@ -433,6 +469,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
auto p = string_split<int>(argv[i], split_delim); auto p = string_split<int>(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-C" || arg == "--cpu-mask") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
} else if (arg == "--cpu-strict") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
} else if (arg == "--poll") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.poll.insert(params.poll.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") { } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -440,12 +497,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
auto p = string_split<int>(argv[i], split_delim); auto p = string_split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
#ifdef GGML_USE_RPC
} else if (arg == "-rpc" || arg == "--rpc") { } else if (arg == "-rpc" || arg == "--rpc") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
params.rpc_servers.push_back(argv[i]); params.rpc_servers.push_back(argv[i]);
#endif
} else if (arg == "-sm" || arg == "--split-mode") { } else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -467,6 +526,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
modes.push_back(mode); modes.push_back(mode);
} }
if (invalid_param) {
break;
}
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
} else if (arg == "-mg" || arg == "--main-gpu") { } else if (arg == "-mg" || arg == "--main-gpu") {
if (++i >= argc) { if (++i >= argc) {
@ -541,6 +603,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break; break;
} }
params.reps = std::stoi(argv[i]); params.reps = std::stoi(argv[i]);
} else if (arg == "--prio") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
} else if (arg == "--delay") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.delay = std::stoi(argv[i]);
} else if (arg == "-o" || arg == "--output") { } else if (arg == "-o" || arg == "--output") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -555,6 +629,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr); invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
} else if (arg == "-v" || arg == "--verbose") { } else if (arg == "-v" || arg == "--verbose") {
params.verbose = true; params.verbose = true;
} else if (arg == "--progress") {
params.progress = true;
} else { } else {
invalid_param = true; invalid_param = true;
break; break;
@ -585,6 +661,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
return params; return params;
} }
@ -598,6 +677,9 @@ struct cmd_params_instance {
ggml_type type_k; ggml_type type_k;
ggml_type type_v; ggml_type type_v;
int n_threads; int n_threads;
std::string cpu_mask;
bool cpu_strict;
int poll;
int n_gpu_layers; int n_gpu_layers;
std::string rpc_servers; std::string rpc_servers;
llama_split_mode split_mode; llama_split_mode split_mode;
@ -667,7 +749,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & tv : params.type_v) for (const auto & tv : params.type_v)
for (const auto & nkvo : params.no_kv_offload) for (const auto & nkvo : params.no_kv_offload)
for (const auto & fa : params.flash_attn) for (const auto & fa : params.flash_attn)
for (const auto & nt : params.n_threads) { for (const auto & nt : params.n_threads)
for (const auto & cm : params.cpu_mask)
for (const auto & cs : params.cpu_strict)
for (const auto & pl : params.poll) {
for (const auto & n_prompt : params.n_prompt) { for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) { if (n_prompt == 0) {
continue; continue;
@ -681,6 +766,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_k = */ tk, /* .type_k = */ tk,
/* .type_v = */ tv, /* .type_v = */ tv,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl, /* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc, /* .rpc_servers = */ rpc,
/* .split_mode = */ sm, /* .split_mode = */ sm,
@ -707,6 +795,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_k = */ tk, /* .type_k = */ tk,
/* .type_v = */ tv, /* .type_v = */ tv,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl, /* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc, /* .rpc_servers = */ rpc,
/* .split_mode = */ sm, /* .split_mode = */ sm,
@ -733,6 +824,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_k = */ tk, /* .type_k = */ tk,
/* .type_v = */ tv, /* .type_v = */ tv,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl, /* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc, /* .rpc_servers = */ rpc,
/* .split_mode = */ sm, /* .split_mode = */ sm,
@ -769,6 +863,9 @@ struct test {
int n_batch; int n_batch;
int n_ubatch; int n_ubatch;
int n_threads; int n_threads;
std::string cpu_mask;
bool cpu_strict;
int poll;
bool has_rpc; bool has_rpc;
ggml_type type_k; ggml_type type_k;
ggml_type type_v; ggml_type type_v;
@ -795,6 +892,9 @@ struct test {
n_batch = inst.n_batch; n_batch = inst.n_batch;
n_ubatch = inst.n_ubatch; n_ubatch = inst.n_ubatch;
n_threads = inst.n_threads; n_threads = inst.n_threads;
cpu_mask = inst.cpu_mask;
cpu_strict = inst.cpu_strict;
poll = inst.poll;
has_rpc = !inst.rpc_servers.empty(); has_rpc = !inst.rpc_servers.empty();
type_k = inst.type_k; type_k = inst.type_k;
type_v = inst.type_v; type_v = inst.type_v;
@ -872,13 +972,14 @@ struct test {
"cpu_info", "gpu_info", "cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params", "model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch", "n_batch", "n_ubatch",
"n_threads", "type_k", "type_v", "n_threads", "cpu_mask", "cpu_strict", "poll",
"type_k", "type_v",
"n_gpu_layers", "split_mode", "n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn", "main_gpu", "no_kv_offload", "flash_attn",
"tensor_split", "use_mmap", "embeddings", "tensor_split", "use_mmap", "embeddings",
"n_prompt", "n_gen", "test_time", "n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns", "avg_ns", "stddev_ns",
"avg_ts", "stddev_ts" "avg_ts", "stddev_ts",
}; };
return fields; return fields;
} }
@ -887,7 +988,7 @@ struct test {
static field_type get_field_type(const std::string & field) { static field_type get_field_type(const std::string & field) {
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
field == "n_threads" || field == "n_threads" || field == "poll" ||
field == "model_size" || field == "model_n_params" || field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" || field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" || field == "n_prompt" || field == "n_gen" ||
@ -896,6 +997,7 @@ struct test {
} }
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" || if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "cpu_strict" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") { field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL; return BOOL;
} }
@ -928,7 +1030,8 @@ struct test {
cpu_info, gpu_info, cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch), std::to_string(n_batch), std::to_string(n_ubatch),
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode), std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@ -996,38 +1099,39 @@ struct csv_printer : public printer {
} }
}; };
static std::string escape_json(const std::string & value) {
std::string escaped;
for (auto c : value) {
if (c == '"') {
escaped += "\\\"";
} else if (c == '\\') {
escaped += "\\\\";
} else if (c <= 0x1f) {
char buf[8];
snprintf(buf, sizeof(buf), "\\u%04x", c);
escaped += buf;
} else {
escaped += c;
}
}
return escaped;
}
static std::string format_json_value(const std::string & field, const std::string & value) {
switch (test::get_field_type(field)) {
case test::STRING:
return "\"" + escape_json(value) + "\"";
case test::BOOL:
return value == "0" ? "false" : "true";
default:
return value;
}
}
struct json_printer : public printer { struct json_printer : public printer {
bool first = true; bool first = true;
static std::string escape_json(const std::string & value) {
std::string escaped;
for (auto c : value) {
if (c == '"') {
escaped += "\\\"";
} else if (c == '\\') {
escaped += "\\\\";
} else if (c <= 0x1f) {
char buf[8];
snprintf(buf, sizeof(buf), "\\u%04x", c);
escaped += buf;
} else {
escaped += c;
}
}
return escaped;
}
static std::string format_value(const std::string & field, const std::string & value) {
switch (test::get_field_type(field)) {
case test::STRING:
return "\"" + escape_json(value) + "\"";
case test::BOOL:
return value == "0" ? "false" : "true";
default:
return value;
}
}
void print_header(const cmd_params & params) override { void print_header(const cmd_params & params) override {
fprintf(fout, "[\n"); fprintf(fout, "[\n");
(void) params; (void) params;
@ -1036,7 +1140,7 @@ struct json_printer : public printer {
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) { void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
assert(fields.size() == values.size()); assert(fields.size() == values.size());
for (size_t i = 0; i < fields.size(); i++) { for (size_t i = 0; i < fields.size(); i++) {
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str()); fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
} }
} }
@ -1059,6 +1163,25 @@ struct json_printer : public printer {
} }
}; };
struct jsonl_printer : public printer {
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
assert(fields.size() == values.size());
for (size_t i = 0; i < fields.size(); i++) {
fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
}
}
void print_test(const test & t) override {
fprintf(fout, "{");
print_fields(test::get_fields(), t.get_values());
fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
fprintf(fout, "}\n");
fflush(fout);
}
};
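For reference, the escaping helpers hoisted out above are shared by both the JSON and the new JSONL printer: `escape_json()` backslash-escapes quotes, backslashes and control characters, while `format_json_value()` quotes STRING fields, maps BOOL fields from `"0"`/`"1"` to `false`/`true`, and passes numeric fields through unchanged. A standalone sketch of those rules (the field names and values below are made-up examples, not real llama-bench output):

```cpp
// Standalone sketch of the quoting rules used by json_printer and jsonl_printer.
// Field names and values are hypothetical, for illustration only.
#include <cstdio>
#include <string>

static std::string escape_json_sketch(const std::string & value) {
    std::string escaped;
    for (unsigned char c : value) {
        if (c == '"') {
            escaped += "\\\"";
        } else if (c == '\\') {
            escaped += "\\\\";
        } else if (c <= 0x1f) {
            char buf[8];
            snprintf(buf, sizeof(buf), "\\u%04x", c);
            escaped += buf;
        } else {
            escaped += c;
        }
    }
    return escaped;
}

int main() {
    // STRING fields are quoted and escaped, BOOL fields become false/true,
    // numeric fields are emitted verbatim -- one object per line in JSONL mode.
    printf("{ \"model_type\": \"%s\", \"cpu_strict\": %s, \"poll\": %s }\n",
           escape_json_sketch("llama 7B Q4_0").c_str(), "false", "50");
    return 0;
}
```

The JSONL printer then writes exactly one such object per benchmark, with the `samples_ns` and `samples_ts` arrays appended, which makes the output easy to stream into line-oriented tools.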
struct markdown_printer : public printer { struct markdown_printer : public printer {
std::vector<std::string> fields; std::vector<std::string> fields;
@ -1067,7 +1190,7 @@ struct markdown_printer : public printer {
return -30; return -30;
} }
if (field == "t/s") { if (field == "t/s") {
return 16; return 20;
} }
if (field == "size" || field == "params") { if (field == "size" || field == "params") {
return 10; return 10;
@ -1149,6 +1272,15 @@ struct markdown_printer : public printer {
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
fields.emplace_back("n_threads"); fields.emplace_back("n_threads");
} }
if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
fields.emplace_back("cpu_mask");
}
if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
fields.emplace_back("cpu_strict");
}
if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
fields.emplace_back("poll");
}
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.emplace_back("n_batch"); fields.emplace_back("n_batch");
} }
@ -1350,6 +1482,8 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
return std::unique_ptr<printer>(new csv_printer()); return std::unique_ptr<printer>(new csv_printer());
case JSON: case JSON:
return std::unique_ptr<printer>(new json_printer()); return std::unique_ptr<printer>(new json_printer());
case JSONL:
return std::unique_ptr<printer>(new jsonl_printer());
case MARKDOWN: case MARKDOWN:
return std::unique_ptr<printer>(new markdown_printer()); return std::unique_ptr<printer>(new markdown_printer());
case SQL: case SQL:
@ -1383,6 +1517,8 @@ int main(int argc, char ** argv) {
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
set_process_priority(params.prio);
// initialize printer // initialize printer
std::unique_ptr<printer> p = create_printer(params.output_format); std::unique_ptr<printer> p = create_printer(params.output_format);
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr); std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
@ -1402,7 +1538,13 @@ int main(int argc, char ** argv) {
llama_model * lmodel = nullptr; llama_model * lmodel = nullptr;
const cmd_params_instance * prev_inst = nullptr; const cmd_params_instance * prev_inst = nullptr;
int params_idx = 0;
auto params_count = params_instances.size();
for (const auto & inst : params_instances) { for (const auto & inst : params_instances) {
params_idx ++;
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
}
// keep the same model between tests when possible // keep the same model between tests when possible
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
if (lmodel) { if (lmodel) {
@ -1428,12 +1570,40 @@ int main(int argc, char ** argv) {
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
// cool off before the test
if (params.delay) {
std::this_thread::sleep_for(std::chrono::seconds(params.delay));
}
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
exit(1);
}
tpp.strict_cpu = t.cpu_strict;
tpp.poll = t.poll;
tpp.prio = params.prio;
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
}
llama_attach_threadpool(ctx, threadpool, NULL);
// warmup run // warmup run
if (t.n_prompt > 0) { if (t.n_prompt > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
}
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
} }
if (t.n_gen > 0) { if (t.n_gen > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
}
test_gen(ctx, 1, 0, t.n_threads); test_gen(ctx, 1, 0, t.n_threads);
} }
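For reference, the per-test threadpool handling added above reduces to the sequence sketched below (assuming an existing `llama_context * ctx`; the mask, poll and priority values are placeholders, not llama-bench defaults). The matching `ggml_threadpool_free()` call appears further down, after the context has been freed.

```cpp
// Condensed sketch of the threadpool setup performed for every benchmark instance.
// Assumes an existing llama_context * ctx; concrete values here are placeholders.
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(/*n_threads =*/ 8);
if (!parse_cpu_mask("0x0f", tpp.cpumask)) {   // example hex mask string, not a default
    fprintf(stderr, "failed to parse cpu-mask\n");
    exit(1);
}
tpp.strict_cpu = true;   // enforce the CPU mask strictly
tpp.poll       = 50;     // polling level used while threads wait for work
tpp.prio       = 0;      // scheduling priority (params.prio)

struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
llama_attach_threadpool(ctx, threadpool, NULL);   // third argument left NULL, as in the patch

// ... warmup, then the timed prompt/generation repetitions ...

ggml_threadpool_free(threadpool);
```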
@ -1443,9 +1613,15 @@ int main(int argc, char ** argv) {
uint64_t t_start = get_time_ns(); uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) { if (t.n_prompt > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
}
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
} }
if (t.n_gen > 0) { if (t.n_gen > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
}
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads); test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
} }
@ -1463,9 +1639,11 @@ int main(int argc, char ** argv) {
fflush(p_err->fout); fflush(p_err->fout);
} }
llama_print_timings(ctx); llama_perf_context_print(ctx);
llama_free(ctx); llama_free(ctx);
ggml_threadpool_free(threadpool);
} }
llama_free_model(lmodel); llama_free_model(lmodel);
View file
@ -120,8 +120,8 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
LOGi("Using %d threads", n_threads); LOGi("Using %d threads", n_threads);
llama_context_params ctx_params = llama_context_default_params(); llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = 2048; ctx_params.n_ctx = 2048;
ctx_params.n_threads = n_threads; ctx_params.n_threads = n_threads;
ctx_params.n_threads_batch = n_threads; ctx_params.n_threads_batch = n_threads;
@ -269,12 +269,6 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
return env->NewStringUTF(result.str().c_str()); return env->NewStringUTF(result.str().c_str());
} }
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C" extern "C"
JNIEXPORT jlong JNICALL JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) { Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
@ -311,6 +305,29 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
return reinterpret_cast<jlong>(batch); return reinterpret_cast<jlong>(batch);
} }
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) {
auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = true;
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
return reinterpret_cast<jlong>(smpl);
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) {
llama_sampler_free(reinterpret_cast<llama_sampler *>(sampler_pointer));
}
extern "C" extern "C"
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) { Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
@ -381,31 +398,21 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
jobject, jobject,
jlong context_pointer, jlong context_pointer,
jlong batch_pointer, jlong batch_pointer,
jlong sampler_pointer,
jint n_len, jint n_len,
jobject intvar_ncur jobject intvar_ncur
) { ) {
const auto context = reinterpret_cast<llama_context *>(context_pointer); const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer); const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
const auto model = llama_get_model(context); const auto model = llama_get_model(context);
if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur); if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I"); if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V"); if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
auto n_vocab = llama_n_vocab(model);
auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// sample the most likely token // sample the most likely token
const auto new_token_id = llama_sample_token_greedy(context, &candidates_p); const auto new_token_id = llama_sampler_sample(sampler, context, -1);
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
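The JNI change above drops the hand-rolled greedy sampling (building a `llama_token_data_array` over the whole vocabulary) in favour of the `llama_sampler` chain API. A minimal sketch of that API as it is used here, assuming an existing `llama_context * ctx` on which the batch has already been decoded:

```cpp
// Minimal sketch of the sampler-chain API replacing the manual greedy loop.
// Assumes an existing llama_context * ctx after a successful llama_decode().
llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
sparams.no_perf = true;                                      // skip per-sampler perf counters
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());  // greedy = argmax over the logits

// index -1 samples from the logits of the last token of the previous batch
llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);

llama_sampler_free(smpl);
```

The Swift code further down builds the same kind of chain, but with temperature, softmax and seeded distribution samplers instead of the greedy one.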
View file
@ -45,8 +45,10 @@ class LLamaAndroid {
private external fun free_context(context: Long) private external fun free_context(context: Long)
private external fun backend_init(numa: Boolean) private external fun backend_init(numa: Boolean)
private external fun backend_free() private external fun backend_free()
private external fun free_batch(batch: Long)
private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
private external fun free_batch(batch: Long)
private external fun new_sampler(): Long
private external fun free_sampler(sampler: Long)
private external fun bench_model( private external fun bench_model(
context: Long, context: Long,
model: Long, model: Long,
@ -69,6 +71,7 @@ class LLamaAndroid {
private external fun completion_loop( private external fun completion_loop(
context: Long, context: Long,
batch: Long, batch: Long,
sampler: Long,
nLen: Int, nLen: Int,
ncur: IntVar ncur: IntVar
): String? ): String?
@ -101,8 +104,11 @@ class LLamaAndroid {
val batch = new_batch(512, 0, 1) val batch = new_batch(512, 0, 1)
if (batch == 0L) throw IllegalStateException("new_batch() failed") if (batch == 0L) throw IllegalStateException("new_batch() failed")
val sampler = new_sampler()
if (sampler == 0L) throw IllegalStateException("new_sampler() failed")
Log.i(tag, "Loaded model $pathToModel") Log.i(tag, "Loaded model $pathToModel")
threadLocalState.set(State.Loaded(model, context, batch)) threadLocalState.set(State.Loaded(model, context, batch, sampler))
} }
else -> throw IllegalStateException("Model already loaded") else -> throw IllegalStateException("Model already loaded")
} }
@ -114,7 +120,7 @@ class LLamaAndroid {
is State.Loaded -> { is State.Loaded -> {
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen)) val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
while (ncur.value <= nlen) { while (ncur.value <= nlen) {
val str = completion_loop(state.context, state.batch, nlen, ncur) val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
if (str == null) { if (str == null) {
break break
} }
@ -138,6 +144,7 @@ class LLamaAndroid {
free_context(state.context) free_context(state.context)
free_model(state.model) free_model(state.model)
free_batch(state.batch) free_batch(state.batch)
free_sampler(state.sampler);
threadLocalState.set(State.Idle) threadLocalState.set(State.Idle)
} }
@ -161,7 +168,7 @@ class LLamaAndroid {
private sealed interface State { private sealed interface State {
data object Idle: State data object Idle: State
data class Loaded(val model: Long, val context: Long, val batch: Long): State data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State
} }
// Enforce only one instance of Llm. // Enforce only one instance of Llm.
View file
@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
actor LlamaContext { actor LlamaContext {
private var model: OpaquePointer private var model: OpaquePointer
private var context: OpaquePointer private var context: OpaquePointer
private var sampling: UnsafeMutablePointer<llama_sampler>
private var batch: llama_batch private var batch: llama_batch
private var tokens_list: [llama_token] private var tokens_list: [llama_token]
var is_done: Bool = false var is_done: Bool = false
@ -42,9 +43,15 @@ actor LlamaContext {
self.tokens_list = [] self.tokens_list = []
self.batch = llama_batch_init(512, 0, 1) self.batch = llama_batch_init(512, 0, 1)
self.temporary_invalid_cchars = [] self.temporary_invalid_cchars = []
let sparams = llama_sampler_chain_default_params()
self.sampling = llama_sampler_chain_init(sparams)
llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax())
llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
} }
deinit { deinit {
llama_sampler_free(sampling)
llama_batch_free(batch) llama_batch_free(batch)
llama_free(context) llama_free(context)
llama_free_model(model) llama_free_model(model)
@ -69,10 +76,9 @@ actor LlamaContext {
print("Using \(n_threads) threads") print("Using \(n_threads) threads")
var ctx_params = llama_context_default_params() var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 2048 ctx_params.n_ctx = 2048
ctx_params.n_threads = UInt32(n_threads) ctx_params.n_threads = Int32(n_threads)
ctx_params.n_threads_batch = UInt32(n_threads) ctx_params.n_threads_batch = Int32(n_threads)
let context = llama_new_context_with_model(model, ctx_params) let context = llama_new_context_with_model(model, ctx_params)
guard let context else { guard let context else {
@ -144,20 +150,7 @@ actor LlamaContext {
func completion_loop() -> String { func completion_loop() -> String {
var new_token_id: llama_token = 0 var new_token_id: llama_token = 0
let n_vocab = llama_n_vocab(model) new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
var candidates = Array<llama_token_data>()
candidates.reserveCapacity(Int(n_vocab))
for token_id in 0..<n_vocab {
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
}
candidates.withUnsafeMutableBufferPointer() { buffer in
var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
new_token_id = llama_sample_token_greedy(context, &candidates_p)
}
if llama_token_is_eog(model, new_token_id) || n_cur == n_len { if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n") print("\n")
View file
@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
```sh ```sh
python ./examples/llava/convert_image_encoder_to_gguf \ python ./examples/llava/convert_image_encoder_to_gguf.py \
-m path/to/clip-vit-large-patch14-336 \ -m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B/llava.projector \ --llava-projector path/to/MobileVLM-1.7B/llava.projector \
--output-dir path/to/MobileVLM-1.7B \ --output-dir path/to/MobileVLM-1.7B \
@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf \
``` ```
```sh ```sh
python ./examples/llava/convert_image_encoder_to_gguf \ python ./examples/llava/convert_image_encoder_to_gguf.py \
-m path/to/clip-vit-large-patch14-336 \ -m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \ --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
--output-dir path/to/MobileVLM-1.7B_V2 \ --output-dir path/to/MobileVLM-1.7B_V2 \
@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf \
4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: 4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh ```sh
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
``` ```
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` 5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
```sh ```sh
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s ./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
``` ```
Now both the LLaMA part and the image encoder are in the `MobileVLM-1.7B` directory. Now both the LLaMA part and the image encoder are in the `MobileVLM-1.7B` directory.
View file
@ -15,9 +15,9 @@ cd llama.cpp
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us) Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
```bash ```bash
python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5 python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert-hf-to-gguf.py ../MiniCPM-Llama3-V-2_5/model python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
# quantize int4 version # quantize int4 version
./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M ./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
View file
@ -0,0 +1,107 @@
## MiniCPM-V 2.6
### Prepare models and code
Download the [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from Hugging Face into the "MiniCPM-V-2_6" folder.
Clone llama.cpp:
```bash
git clone git@github.com:OpenBMB/llama.cpp.git
cd llama.cpp
git checkout minicpmv-main
```
### Usage of MiniCPM-V 2.6
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) files we provide)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
# quantize int4 version
./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build for Linux or Mac
```bash
make
make llama-minicpmv-cli
```
Inference on Linux or Mac
```
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
### Video
Install FFmpeg
```
brew install ffmpeg
brew install pkg-config
```
### Android
#### Build on Android device using Termux
We found that building on the Android device itself gives better runtime performance, so we recommend building on the device.
[Termux](https://github.com/termux/termux-app#installation) is a terminal app for Android devices (no root required).
Install tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (on Android 11+ run the command twice).
Finally, copy the built `llama` binaries and the model file to your device storage. Because file permissions on the Android sdcard cannot be changed, copy the executables to the `/data/data/com.termux/files/home/bin` path and then run the following commands in Termux to make them executable:
(This assumes you have pushed the built executables to the /sdcard/llama.cpp/bin path using `adb push`.)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download the models and push them to `/sdcard/llama.cpp/`, then move them to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```
View file
@ -3,7 +3,6 @@
// I'll gradually clean and extend it // I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h" #include "clip.h"
#include "log.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
@ -20,6 +19,10 @@
#include "ggml-cann.h" #include "ggml-cann.h"
#endif #endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h" #include "stb_image.h"
@ -36,6 +39,11 @@
#include <cinttypes> #include <cinttypes>
#include <limits> #include <limits>
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
//#define CLIP_DEBUG_FUNCTIONS //#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image // RGB uint8 image
@ -81,6 +89,7 @@ static std::string format(const char * fmt, ...) {
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_USE_GELU "clip.use_gelu" #define KEY_USE_GELU "clip.use_gelu"
#define KEY_N_EMBD "clip.%s.embedding_length" #define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length" #define KEY_N_FF "clip.%s.feed_forward_length"
@ -160,7 +169,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
static int get_key_idx(const gguf_context * ctx, const char * key) { static int get_key_idx(const gguf_context * ctx, const char * key) {
int i = gguf_find_key(ctx, key); int i = gguf_find_key(ctx, key);
if (i == -1) { if (i == -1) {
LOG_TEE("key %s not found in file\n", key); LOG_ERR("key %s not found in file\n", key);
throw std::runtime_error(format("Missing required key: %s", key)); throw std::runtime_error(format("Missing required key: %s", key));
} }
@ -211,13 +220,19 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
static void replace_all(std::string & s, const std::string & search, const std::string & replace) { static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) { if (search.empty()) {
return; // Avoid infinite loop if 'search' is an empty string return;
} }
std::string builder;
builder.reserve(s.length());
size_t pos = 0; size_t pos = 0;
while ((pos = s.find(search, pos)) != std::string::npos) { size_t last_pos = 0;
s.replace(pos, search.length(), replace); while ((pos = s.find(search, last_pos)) != std::string::npos) {
pos += replace.length(); builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
} }
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
} }
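A small illustration (hypothetical input) of the rewritten `replace_all()`: it appends the unchanged stretches and the replacement into a separate builder string, so each character is copied once instead of repeatedly shifting the tail of `s` with `std::string::replace()`.

```cpp
// Mirrors the call made later in this file to escape newlines in GGUF metadata values.
std::string s = "line1\nline2";   // contains a real newline character
replace_all(s, "\n", "\\n");      // s == "line1\\nline2" (a backslash followed by 'n')
```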
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
@ -259,7 +274,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
size_t tensor_size = ggml_nbytes(tensor); size_t tensor_size = ggml_nbytes(tensor);
LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
prefix, ggml_n_dims(tensor), tensor->name, tensor_size, prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
} }
@ -277,7 +292,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary); std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) { if (!file.is_open()) {
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
return; return;
} }
@ -296,7 +311,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary); std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) { if (!file.is_open()) {
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
return; return;
} }
@ -526,6 +541,7 @@ struct clip_ctx {
bool has_vision_encoder = false; bool has_vision_encoder = false;
bool has_llava_projector = false; bool has_llava_projector = false;
bool has_minicpmv_projector = false; bool has_minicpmv_projector = false;
int minicpmv_version = 2;
struct clip_vision_model vision_model; struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP; projector_type proj_type = PROJECTOR_TYPE_MLP;
@ -556,7 +572,7 @@ struct clip_ctx {
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n"); LOG_ERR("This gguf file seems to have no vision encoder\n");
return nullptr; return nullptr;
} }
@ -570,7 +586,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
if (load_image_size == nullptr) { if (load_image_size == nullptr) {
load_image_size = clip_image_size_init(); load_image_size = clip_image_size_init();
} }
LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
image_size_width = load_image_size->width; image_size_width = load_image_size->width;
image_size_height = load_image_size->height; image_size_height = load_image_size->height;
if (is_inf) { if (is_inf) {
@ -641,7 +657,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
if (ctx->has_minicpmv_projector) { if (ctx->has_minicpmv_projector) {
int pos_w = image_size_width/patch_size; int pos_w = image_size_width/patch_size;
int pos_h = image_size_height/patch_size; int pos_h = image_size_height/patch_size;
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); if (ctx->minicpmv_version == 2) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
}
else if (ctx->minicpmv_version == 3) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
ggml_set_name(pos_embed, "pos_embed"); ggml_set_name(pos_embed, "pos_embed");
ggml_set_input(pos_embed); ggml_set_input(pos_embed);
} }
@ -951,10 +972,20 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
} }
{ // attention { // attention
const int hidden_size = 4096; int hidden_size = 4096;
const int d_head = 128; const int d_head = 128;
const int n_head = hidden_size/d_head; int n_head = hidden_size/d_head;
const int num_query = 96; int num_query = 96;
if (ctx->minicpmv_version == 2) {
hidden_size = 4096;
n_head = hidden_size/d_head;
num_query = 96;
}
else if (ctx->minicpmv_version == 3) {
hidden_size = 3584;
n_head = hidden_size/d_head;
num_query = 64;
}
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
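Since `d_head` stays fixed at 128, the branches above switch the resampler attention from 32 heads with 96 query tokens (`minicpmv_version == 2`, MiniCPM-Llama3-V 2.5) to 28 heads with 64 query tokens (`minicpmv_version == 3`, MiniCPM-V 2.6). A quick compile-time check of that arithmetic, not part of the patch:

```cpp
// Head counts implied by the hidden sizes above, with d_head fixed at 128.
static_assert(4096 / 128 == 32, "minicpmv_version 2: 32 heads, 96 query tokens");
static_assert(3584 / 128 == 28, "minicpmv_version 3: 28 heads, 64 query tokens");
```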
@ -1022,21 +1053,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
const int idx_name = gguf_find_key(ctx, KEY_NAME); const int idx_name = gguf_find_key(ctx, KEY_NAME);
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
const std::string name = gguf_get_val_str(ctx, idx_name); const std::string name = gguf_get_val_str(ctx, idx_name);
LOG_TEE("%s: model name: %s\n", __func__, name.c_str()); LOG_INF("%s: model name: %s\n", __func__, name.c_str());
} }
LOG_TEE("%s: description: %s\n", __func__, description.c_str()); LOG_INF("%s: description: %s\n", __func__, description.c_str());
LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors); LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
LOG_TEE("%s: n_kv: %d\n", __func__, n_kv); LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str()); LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
LOG_TEE("\n"); LOG_INF("\n");
} }
const int n_tensors = gguf_get_n_tensors(ctx); const int n_tensors = gguf_get_n_tensors(ctx);
// kv // kv
const int n_kv = gguf_get_n_kv(ctx); const int n_kv = gguf_get_n_kv(ctx);
LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
__func__, n_kv, n_tensors, fname); __func__, n_kv, n_tensors, fname);
{ {
std::map<enum ggml_type, uint32_t> n_type; std::map<enum ggml_type, uint32_t> n_type;
@ -1047,7 +1078,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
n_type[type]++; n_type[type]++;
} }
LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) { for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx, i); const char * name = gguf_get_key(ctx, i);
const enum gguf_type type = gguf_get_kv_type(ctx, i); const enum gguf_type type = gguf_get_kv_type(ctx, i);
@ -1063,7 +1094,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
replace_all(value, "\n", "\\n"); replace_all(value, "\n", "\\n");
LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
} }
// print type counts // print type counts
@ -1072,7 +1103,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
continue; continue;
} }
LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
} }
} }
@ -1087,13 +1118,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
size_t tensor_size = ggml_nbytes(cur); size_t tensor_size = ggml_nbytes(cur);
model_size += tensor_size; model_size += tensor_size;
if (verbosity >= 3) { if (verbosity >= 3) {
LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
} }
} }
} }
clip_ctx * new_clip = new clip_ctx; clip_ctx * new_clip = new clip_ctx{};
// update projector type // update projector type
{ {
@ -1114,23 +1145,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
new_clip->backend = ggml_backend_cuda_init(0); new_clip->backend = ggml_backend_cuda_init(0);
LOG_TEE("%s: CLIP using CUDA backend\n", __func__); LOG_INF("%s: CLIP using CUDA backend\n", __func__);
#endif #endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
new_clip->backend = ggml_backend_metal_init(); new_clip->backend = ggml_backend_metal_init();
LOG_TEE("%s: CLIP using Metal backend\n", __func__); LOG_INF("%s: CLIP using Metal backend\n", __func__);
#endif #endif
#ifdef GGML_USE_CANN #ifdef GGML_USE_CANN
new_clip->backend = ggml_backend_cann_init(0); new_clip->backend = ggml_backend_cann_init(0);
LOG_TEE("%s: CLIP using CANN backend\n", __func__); LOG_INF("%s: CLIP using CANN backend\n", __func__);
#endif #endif
#ifdef GGML_USE_VULKAN
new_clip->backend = ggml_backend_vk_init(0);
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
#endif
if (!new_clip->backend) { if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init(); new_clip->backend = ggml_backend_cpu_init();
LOG_TEE("%s: CLIP using CPU backend\n", __func__); LOG_INF("%s: CLIP using CPU backend\n", __func__);
} }
// model size and capabilities // model size and capabilities
@ -1151,6 +1186,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx); new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
} }
idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
if (idx != -1) {
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
}
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
GGML_ASSERT(new_clip->has_vision_encoder); GGML_ASSERT(new_clip->has_vision_encoder);
@ -1160,16 +1200,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->use_gelu = gguf_get_val_bool(ctx, idx); new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
if (verbosity >= 1) { if (verbosity >= 1) {
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
} }
} }
LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
// load tensors // load tensors
{ {
@ -1182,7 +1222,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->ctx_data = ggml_init(params); new_clip->ctx_data = ggml_init(params);
if (!new_clip->ctx_data) { if (!new_clip->ctx_data) {
LOG_TEE("%s: ggml_init() failed\n", __func__); LOG_ERR("%s: ggml_init() failed\n", __func__);
clip_free(new_clip); clip_free(new_clip);
gguf_free(ctx); gguf_free(ctx);
return nullptr; return nullptr;
@ -1190,7 +1230,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
auto fin = std::ifstream(fname, std::ios::binary); auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) { if (!fin) {
LOG_TEE("cannot open model file for loading tensors\n"); LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip); clip_free(new_clip);
gguf_free(ctx); gguf_free(ctx);
return nullptr; return nullptr;
@ -1212,7 +1252,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg); fin.seekg(offset, std::ios::beg);
if (!fin) { if (!fin) {
LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name); LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
clip_free(new_clip); clip_free(new_clip);
gguf_free(ctx); gguf_free(ctx);
return nullptr; return nullptr;
@ -1283,23 +1323,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
if (verbosity >= 2) { if (verbosity >= 2) {
LOG_TEE("\n%s: vision model hparams\n", __func__); LOG_INF("\n%s: vision model hparams\n", __func__);
LOG_TEE("image_size %d\n", hparams.image_size); LOG_INF("image_size %d\n", hparams.image_size);
LOG_TEE("patch_size %d\n", hparams.patch_size); LOG_INF("patch_size %d\n", hparams.patch_size);
LOG_TEE("v_hidden_size %d\n", hparams.hidden_size); LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate); LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
LOG_TEE("v_projection_dim %d\n", hparams.projection_dim); LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
LOG_TEE("v_n_head %d\n", hparams.n_head); LOG_INF("v_n_head %d\n", hparams.n_head);
LOG_TEE("v_n_layer %d\n", hparams.n_layer); LOG_INF("v_n_layer %d\n", hparams.n_layer);
LOG_TEE("v_eps %f\n", hparams.eps); LOG_INF("v_eps %f\n", hparams.eps);
LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
LOG_TEE("v_image_grid_pinpoints: "); LOG_INF("v_image_grid_pinpoints: ");
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
LOG_TEE("%d ", hparams.image_grid_pinpoints[i]); LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
} }
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
} }
@ -1337,7 +1377,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
} catch(const std::exception& /*e*/) { } catch(const std::exception& /*e*/) {
LOG_TEE("%s: failed to load vision model tensors\n", __func__); LOG_ERR("%s: failed to load vision model tensors\n", __func__);
} }
// LLaVA projection // LLaVA projection
@ -1366,7 +1406,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} catch (std::runtime_error & /*e*/) { } } catch (std::runtime_error & /*e*/) { }
try { try {
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
// LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
} catch (std::runtime_error & /*e*/) { } } catch (std::runtime_error & /*e*/) { }
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection // MobileVLM projection
@ -1467,7 +1507,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
ggml_gallocr_reserve(new_clip->compute_alloc, gf); ggml_gallocr_reserve(new_clip->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
} }
return new_clip; return new_clip;
@ -1518,7 +1558,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
int nx, ny, nc; int nx, ny, nc;
auto * data = stbi_load(fname, &nx, &ny, &nc, 3); auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) { if (!data) {
LOG_TEE("%s: failed to load image '%s'\n", __func__, fname); LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
return false; return false;
} }
build_clip_img_from_data(data, nx, ny, img); build_clip_img_from_data(data, nx, ny, img);
@ -1530,7 +1570,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
int nx, ny, nc; int nx, ny, nc;
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
if (!data) { if (!data) {
LOG_TEE("%s: failed to decode image bytes\n", __func__); LOG_ERR("%s: failed to decode image bytes\n", __func__);
return false; return false;
} }
build_clip_img_from_data(data, nx, ny, img); build_clip_img_from_data(data, nx, ny, img);
@ -1589,7 +1629,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
} }
} }
inline float clip(float x, float lower, float upper) { inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper)); return std::max(lower, std::min(x, upper));
} }
@ -1720,7 +1760,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
int downscaled_height = static_cast<int>(original_height * scale); int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution; int wasted_resolution = (width * height) - effective_resolution;
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution; max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution; min_wasted_resolution = wasted_resolution;
@ -1793,10 +1833,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
return refine_size; return refine_size;
} }
inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
std::vector<int> candidate_split_grids_nums; std::vector<int> candidate_split_grids_nums;
for (int i : {multiple - 1, multiple, multiple + 1}) { for (int i : {multiple - 1, multiple, multiple + 1}) {
@ -1842,7 +1878,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
const int multiple = fmin(ceil(ratio), max_slice_nums); const int multiple = fmin(ceil(ratio), max_slice_nums);
std::vector<std::vector<clip_image_u8 *>> images; std::vector<std::vector<clip_image_u8 *>> images;
LOG_TEE("%s: multiple %d\n", __func__, multiple); LOG_INF("%s: multiple %d\n", __func__, multiple);
images.push_back(std::vector<clip_image_u8 *>()); images.push_back(std::vector<clip_image_u8 *>());
if (multiple <= 1) { if (multiple <= 1) {
@ -1857,17 +1893,17 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
clip_image_u8 * source_image = clip_image_u8_init(); clip_image_u8 * source_image = clip_image_u8_init();
bicubic_resize(*img, *source_image, best_size.first, best_size.second); bicubic_resize(*img, *source_image, best_size.first, best_size.second);
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
images[images.size()-1].push_back(source_image); images[images.size()-1].push_back(source_image);
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
clip_image_u8 * refine_image = clip_image_u8_init(); clip_image_u8 * refine_image = clip_image_u8_init();
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
// split_to_patches // split_to_patches
int width = refine_image->nx; int width = refine_image->nx;
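Note on the slicing logic above: the number of slices is derived from how far the image area exceeds the scaled working resolution. A minimal sketch of that computation follows; scale_resolution = 448 is an assumption about the MiniCPM-V preprocessing defaults (it is not visible in this hunk), while the default of 9 matches the max_slice_nums used further down in clip_image_preprocess.

#include <algorithm>
#include <cmath>

// Sketch: how many slices a high-resolution image is split into.
static int uhd_slice_multiple(int nx, int ny, int max_slice_nums = 9, int scale_resolution = 448) {
    const double ratio = (double) nx * ny / ((double) scale_resolution * scale_resolution);
    return std::min((int) std::ceil(ratio), max_slice_nums);
}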
@ -1912,17 +1948,19 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found // res_imgs memory is being allocated here, previous allocations will be freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
if (clip_is_minicpmv(ctx)) {
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img); if(clip_is_minicpmv(ctx)){
int max_slice_nums = 9;
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
res_imgs->size = 0; res_imgs->size = 0;
for (size_t i = 0; i < imgs.size(); ++i) { for (size_t i = 0; i < imgs.size(); ++i){
res_imgs->size += imgs[i].size(); res_imgs->size += imgs[i].size();
} }
res_imgs->data = new clip_image_f32[res_imgs->size]; res_imgs->data = new clip_image_f32[res_imgs->size];
int idx = 0; int idx = 0;
for (size_t i = 0; i < imgs.size(); ++i) { for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t j = 0; j < imgs[i].size(); ++j) { for (size_t j = 0; j < imgs[i].size(); ++j) {
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
clip_image_f32 * res = clip_image_f32_init(); clip_image_f32 * res = clip_image_f32_init();
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std); normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
res_imgs->data[idx++] = *res; res_imgs->data[idx++] = *res;
@ -1934,7 +1972,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
bool pad_to_square = true; bool pad_to_square = true;
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n"); LOG_ERR("This gguf file seems to have no vision encoder\n");
return false; return false;
} }
auto & params = ctx->vision_model.hparams; auto & params = ctx->vision_model.hparams;
@ -2011,7 +2049,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
} }
for (size_t i = 0; i < patches.size(); i++) { for (size_t i = 0; i < patches.size(); i++) {
// LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
clip_image_u8_free(patches[i]); clip_image_u8_free(patches[i]);
} }
@ -2148,7 +2186,12 @@ int clip_n_patches(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) { if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
n_patches /= 4; n_patches /= 4;
} else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
n_patches = 96; if (ctx->minicpmv_version == 2) {
n_patches = 96;
}
else if (ctx->minicpmv_version == 3) {
n_patches = 64;
}
} }
return n_patches; return n_patches;
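Together with the projector width change in a later hunk (4096 for version 2, 3584 for version 3), this reduces the per-slice embedding buffer for the newer model. A small arithmetic sketch under that assumption:

#include <cstddef>

// Sketch: per-slice embedding size for the RESAMPLER projector (float32).
static size_t minicpmv_slice_embd_bytes(int minicpmv_version) {
    const int n_patches = (minicpmv_version == 3) ?   64 :   96; // tokens per slice (this hunk)
    const int n_embd    = (minicpmv_version == 3) ? 3584 : 4096; // projector width (later hunk)
    return (size_t) n_patches * n_embd * sizeof(float);          // 1572864 B (v2) vs 917504 B (v3)
}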
@ -2242,7 +2285,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n"); LOG_ERR("This gguf file seems to have no vision encoder\n");
return false; return false;
} }
@ -2254,7 +2297,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n"); LOG_ERR("This gguf file seems to have no vision encoder\n");
return false; return false;
} }
@ -2284,6 +2327,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
if(ctx->load_image_size==nullptr){
ctx->load_image_size= clip_image_size_init();
}
const int pos_w = ctx->load_image_size->width/patch_size;
const int pos_h = ctx->load_image_size->height/patch_size;
{ {
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@ -2318,8 +2366,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions)); int* positions_data = (int*)malloc(ggml_nbytes(positions));
for (int i = 0; i < num_positions; i++) { int bucket_coords_h[70];
positions_data[i] = std::floor(70.0*i/num_positions); int bucket_coords_w[70];
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
}
for (int i = 0; i < pos_w; i++){
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
}
for (int i = 0, id = 0; i < pos_h; i++){
for (int j = 0; j < pos_w; j++){
positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
}
} }
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data); free(positions_data);
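The new position ids quantize each axis into 70 buckets and combine them row-major as h*70 + w; note that the fixed 70-entry scratch arrays above implicitly assume pos_h and pos_w never exceed 70. A standalone sketch of the same computation (the bucket count comes straight from the arrays above):

#include <cmath>
#include <vector>

// Sketch: 2D bucketed position ids, mirroring the loops above.
static std::vector<int> bucket_position_ids(int pos_w, int pos_h, int n_buckets = 70) {
    std::vector<int> ids;
    ids.reserve((size_t) pos_w * pos_h);
    for (int i = 0; i < pos_h; i++) {
        const int bh = (int) std::floor((double) n_buckets * i / pos_h);
        for (int j = 0; j < pos_w; j++) {
            const int bw = (int) std::floor((double) n_buckets * j / pos_w);
            ids.push_back(bh * n_buckets + bw);
        }
    }
    return ids;
}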
@ -2330,12 +2388,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
if(ctx->load_image_size==nullptr){
ctx->load_image_size= clip_image_size_init();
}
int pos_w = ctx->load_image_size->width/patch_size;
int pos_h = ctx->load_image_size->height/patch_size;
int embed_dim = 4096; int embed_dim = 4096;
if (ctx->minicpmv_version == 2) {
embed_dim = 4096;
}
else if (ctx->minicpmv_version == 3) {
embed_dim = 3584;
}
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
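The interpolated positional table follows the usual 2D sin/cos construction, with half of embed_dim devoted to the row coordinate and half to the column. A hedged 1D sketch of that convention is below; the get_2d_sincos_pos_embed used here may differ in layout details.

#include <cmath>
#include <vector>

// Sketch: conventional 1D sin/cos embedding; a 2D table concatenates
// sincos_1d(dim/2, h) and sincos_1d(dim/2, w) for each grid cell.
static std::vector<float> sincos_1d(int dim, float pos) {
    std::vector<float> out(dim);
    for (int i = 0; i < dim / 2; i++) {
        const double omega = 1.0 / std::pow(10000.0, 2.0 * i / dim);
        out[i]           = (float) std::sin(pos * omega);
        out[i + dim / 2] = (float) std::cos(pos * omega);
    }
    return out;
}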
@ -2348,7 +2407,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed)); ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
free(pos_embed_data); free(pos_embed_data);
} }
} else { }
else{
{ {
if (ctx->has_class_embedding) { if (ctx->has_class_embedding) {
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
@ -2395,7 +2455,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_graph_compute(ctx->backend, gf); ggml_backend_graph_compute(ctx->backend, gf);
// the last node is the embedding tensor // the last node is the embedding tensor
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
// copy the embeddings to the location passed by the user // copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
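Graph nodes are now reached through the public accessor rather than direct member access; a minimal sketch of the call used above (as the -1 here implies, negative indices count back from the end of the graph):

#include "ggml.h"

// Sketch: fetch the last node (the embedding tensor) of a computed graph, as done above.
static struct ggml_tensor * last_node(struct ggml_cgraph * gf) {
    return ggml_graph_node(gf, -1);
}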
@ -2467,7 +2527,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
new_type = type; new_type = type;
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
// LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
} }
const size_t n_elms = ggml_nelements(cur); const size_t n_elms = ggml_nelements(cur);
float * f32_data; float * f32_data;
@ -2486,7 +2546,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
f32_data = (float *)conv_buf.data(); f32_data = (float *)conv_buf.data();
break; break;
default: default:
LOG_TEE("Please use an input file in f32 or f16\n"); LOG_ERR("Please use an input file in f32 or f16\n");
gguf_free(ctx_out); gguf_free(ctx_out);
return false; return false;
} }
@ -2513,7 +2573,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
fout.put(0); fout.put(0);
} }
LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
} }
@ -2529,8 +2589,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
gguf_free(ctx_out); gguf_free(ctx_out);
{ {
LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
} }
return true; return true;
@ -2554,13 +2614,21 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->vision_model.mm_3_b->ne[0]; return ctx->vision_model.mm_3_b->ne[0];
} }
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
return 4096; if (ctx->minicpmv_version == 2) {
return 4096;
}
else if (ctx->minicpmv_version == 3) {
return 3584;
}
} }
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
} }
bool clip_is_minicpmv(const struct clip_ctx * ctx) { int clip_is_minicpmv(const struct clip_ctx * ctx) {
return ctx->has_minicpmv_projector; if (ctx->has_minicpmv_projector) {
return ctx->minicpmv_version;
}
return 0;
} }
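Since clip_is_minicpmv now returns the projector version (0 when absent) instead of a bool, call sites can branch on the value directly. A hedged sketch; the version-to-model pairing is an assumption on my part and is not stated in the diff.

#include "clip.h"

// Sketch: dispatching on the version id returned by clip_is_minicpmv().
static const char * minicpmv_variant_name(const struct clip_ctx * ctx_clip) {
    switch (clip_is_minicpmv(ctx_clip)) {
        case 2:  return "MiniCPM-Llama3-V 2.5"; // assumed pairing
        case 3:  return "MiniCPM-V 2.6";        // assumed pairing
        default: return "not a MiniCPM-V projector";
    }
}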
@ -85,7 +85,7 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
CLIP_API bool clip_is_minicpmv(const struct clip_ctx * ctx); CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
#ifdef __cplusplus #ifdef __cplusplus
} }
@ -1,14 +1,16 @@
#include "ggml.h" #include "arg.h"
#include "base64.hpp"
#include "log.h" #include "log.h"
#include "common.h" #include "common.h"
#include "sampling.h"
#include "clip.h" #include "clip.h"
#include "llava.h" #include "llava.h"
#include "llama.h" #include "llama.h"
#include "ggml.h"
#include "base64.hpp"
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <cstring>
#include <vector> #include <vector>
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) { static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
@ -19,7 +21,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
n_eval = n_batch; n_eval = n_batch;
} }
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false; return false;
} }
*n_past += n_eval; *n_past += n_eval;
@ -40,11 +42,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
return true; return true;
} }
static const char * sample(struct llama_sampling_context * ctx_sampling, static const char * sample(struct gpt_sampler * smpl,
struct llama_context * ctx_llama, struct llama_context * ctx_llama,
int * n_past) { int * n_past) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
llama_sampling_accept(ctx_sampling, ctx_llama, id, true); gpt_sampler_accept(smpl, id, true);
static std::string ret; static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>"; ret = "</s>";
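The sampling calls move from the llama_sampling_* helpers to the common gpt_sampler interface. A minimal sketch of one step as used here; the third argument to gpt_sampler_accept is assumed to be the grammar-accept flag.

#include "sampling.h"

// Sketch: one sampling step with the new API, mirroring sample() above.
static llama_token sample_one(struct gpt_sampler * smpl, struct llama_context * ctx_llama) {
    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1); // -1: use the most recent logits
    gpt_sampler_accept(smpl, id, true);                             // assumed: true = also advance the grammar
    return id;
}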
@ -74,7 +76,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
size_t img_base64_str_start, img_base64_str_end; size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end); find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) { if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL; return NULL;
} }
@ -88,7 +90,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) { if (!embed) {
LOG_TEE("%s: could not load image from base64 string.\n", __func__); LOG_ERR("%s: could not load image from base64 string.\n", __func__);
return NULL; return NULL;
} }
@ -112,12 +114,10 @@ struct llava_context {
struct llama_model * model = NULL; struct llama_model * model = NULL;
}; };
static void print_usage(int argc, char ** argv, const gpt_params & params) { static void print_usage(int, char ** argv) {
gpt_params_print_usage(argc, argv, params); LOG("\n example usage:\n");
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n example usage:\n"); LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
} }
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@ -127,16 +127,16 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
auto prompt = params->prompt; auto prompt = params->prompt;
if (prompt_contains_image(prompt)) { if (prompt_contains_image(prompt)) {
if (!params->image.empty()) { if (!params->image.empty()) {
LOG_TEE("using base64 encoded image instead of command line image path\n"); LOG_INF("using base64 encoded image instead of command line image path\n");
} }
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt); embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) { if (!embed) {
LOG_TEE("%s: can't load image from prompt\n", __func__); LOG_ERR("%s: can't load image from prompt\n", __func__);
return NULL; return NULL;
} }
params->prompt = remove_image_from_prompt(prompt); params->prompt = remove_image_from_prompt(prompt);
} else { } else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str()); embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embed) { if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL; return NULL;
@ -157,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos); system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length()); user_prompt = prompt.substr(image_pos + std::string("<image>").length());
LOG_TEE("system_prompt: %s\n", system_prompt.c_str()); LOG_INF("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) { if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
} }
} }
LOG_TEE("user_prompt: %s\n", user_prompt.c_str()); LOG_INF("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) { if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
} }
} }
} else { } else {
@ -178,7 +178,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
if (params->verbose_prompt) { if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
} }
} }
} }
@ -189,21 +189,21 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
// generate the response // generate the response
LOG_TEE("\n"); LOG("\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
if (!ctx_sampling) { if (!smpl) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
exit(1); exit(1);
} }
std::string response = ""; std::string response = "";
for (int i = 0; i < max_tgt_len; i++) { for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
response += tmp; response += tmp;
if (strcmp(tmp, "</s>") == 0) break; if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp); LOG("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@ -211,8 +211,8 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
fflush(stdout); fflush(stdout);
} }
llama_sampling_free(ctx_sampling); gpt_sampler_free(smpl);
printf("\n"); LOG("\n");
} }
static struct llama_model * llava_init(gpt_params * params) { static struct llama_model * llava_init(gpt_params * params) {
@ -223,7 +223,7 @@ static struct llama_model * llava_init(gpt_params * params) {
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n" , __func__); LOG_ERR("%s: unable to load model\n" , __func__);
return NULL; return NULL;
} }
return model; return model;
@ -246,11 +246,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) { if (ctx_llama == NULL) {
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL; return NULL;
} }
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama; ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip; ctx_llava->ctx_clip = ctx_clip;
@ -269,65 +269,54 @@ static void llava_free(struct llava_context * ctx_llava) {
llama_backend_free(); llama_backend_free();
} }
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
ggml_time_init(); ggml_time_init();
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
print_usage(argc, argv, params);
return 1; return 1;
} }
#ifndef LOG_DISABLE_LOGS gpt_init();
log_set_target(log_filename_generator("llava", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv, {}); print_usage(argc, argv);
return 1; return 1;
} }
auto model = llava_init(&params);
auto * model = llava_init(&params);
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: failed to init llava model\n", __func__); fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
return 1; return 1;
} }
if (prompt_contains_image(params.prompt)) { if (prompt_contains_image(params.prompt)) {
auto ctx_llava = llava_init_context(&params, model); auto * ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, ""); auto * image_embed = load_image(ctx_llava, &params, "");
// process the prompt // process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt); process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama); llama_perf_context_print(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed); llava_image_embed_free(image_embed);
ctx_llava->model = NULL; ctx_llava->model = NULL;
llava_free(ctx_llava); llava_free(ctx_llava);
} else { } else {
for (auto & image : params.image) { for (auto & image : params.image) {
auto ctx_llava = llava_init_context(&params, model); auto * ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, image); auto * image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) { if (!image_embed) {
std::cerr << "error: failed to load image " << image << ". Terminating\n\n"; LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
return 1; return 1;
} }
// process the prompt // process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt); process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama); llama_perf_context_print(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed); llava_image_embed_free(image_embed);
ctx_llava->model = NULL; ctx_llava->model = NULL;
llava_free(ctx_llava); llava_free(ctx_llava);
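Option parsing now goes through the example-aware parser plus the common init helper. A minimal usage sketch matching the calls introduced above; the example id and the callback signature are exactly the ones in the diff, the rest is illustrative.

#include "arg.h"
#include "common.h"
#include "log.h"

static void print_usage(int, char ** argv) {
    LOG("\n example usage:\n %s -m model.gguf --mmproj mmproj.gguf --image img.jpg\n", argv[0]);
}

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
        return 1;
    }
    gpt_init(); // replaces the manual log_set_target / llama_log_set block that was removed
    return 0;
}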
@ -1,13 +1,23 @@
#include "clip.h" #include "clip.h"
#include "common.h"
#include "llama.h"
#include "llava.h" #include "llava.h"
#include "base64.hpp"
#include "llama.h"
#include <algorithm>
#include <cerrno>
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <cstring>
#include <limits>
#include <vector> #include <vector>
#include <numeric>
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
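llava.cpp defines these thin stdio-backed macros instead of pulling in the common log facility. A small illustrative use; the helper and its arguments are hypothetical.

// Hypothetical helper showing where each macro writes.
static bool check_embed_dim(int n_image_embd, int n_llama_embd) {
    if (n_image_embd != n_llama_embd) {
        LOG_ERR("embedding dim mismatch: %d vs %d\n", n_image_embd, n_llama_embd); // stderr
        return false;
    }
    LOG_INF("embedding dim ok: %d\n", n_llama_embd); // stdout
    return true;
}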
// RGB uint8 image // RGB uint8 image
struct clip_image_u8 { struct clip_image_u8 {
@ -54,7 +64,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
int downscaled_height = static_cast<int>(original_height * scale); int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution; int wasted_resolution = (width * height) - effective_resolution;
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution; max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution; min_wasted_resolution = wasted_resolution;
@ -184,7 +194,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
ggml_build_forward_expand(gf, flatten); ggml_build_forward_expand(gf, flatten);
ggml_graph_compute_with_ctx(model.ctx, gf, 1); ggml_graph_compute_with_ctx(model.ctx, gf, 1);
struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor* result = ggml_graph_node(gf, -1);
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
// append without newline tokens (default behavior in llava_arch when not using unpad ): // append without newline tokens (default behavior in llava_arch when not using unpad ):
@ -236,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
img_res_v.size = 0; img_res_v.size = 0;
img_res_v.data = nullptr; img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
LOG_TEE("%s: unable to preprocess image\n", __func__); LOG_ERR("%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data; delete[] img_res_v.data;
return false; return false;
} }
@ -256,16 +266,23 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
load_image_size->width = img_res_v.data[i].nx; load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny; load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size); clip_add_load_image_size(ctx_clip, load_image_size);
const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
if (!encoded) { if (!encoded) {
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false; return false;
} }
const int64_t t_img_enc_steop_batch_us = ggml_time_us(); const int64_t t_img_enc_steop_batch_us = ggml_time_us();
LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
} }
const int64_t t_img_enc_batch_us = ggml_time_us(); const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
int n_img_pos_out = 0; int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) { for (size_t i = 0; i < image_embd_v.size(); i++) {
@ -280,7 +297,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
load_image_size->width = img->nx; load_image_size->width = img->nx;
load_image_size->height = img->ny; load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size); clip_add_load_image_size(ctx_clip, load_image_size);
LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
} }
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding // flat / default llava-1.5 type embedding
@ -288,7 +305,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data; delete[] img_res_v.data;
if (!encoded) { if (!encoded) {
LOG_TEE("Unable to encode image\n"); LOG_ERR("Unable to encode image\n");
return false; return false;
} }
@ -302,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) { if (!encoded) {
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false; return false;
} }
} }
const int64_t t_img_enc_batch_us = ggml_time_us(); const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip); const int32_t * image_grid = clip_image_grid(ctx_clip);
@ -340,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
// clip_image_save_to_bmp(*tmp, "image_feature.bmp"); // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
} }
LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
const int64_t t_img_enc_end_us = ggml_time_us(); const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
return true; return true;
} }
@ -355,7 +372,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip); auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) { if (n_image_embd != n_llama_embd) {
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
return false; return false;
} }
return true; return true;
@ -368,13 +385,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
} }
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
if (!image_embd) { if (!image_embd) {
LOG_TEE("Unable to allocate memory for image embeddings\n"); LOG_ERR("Unable to allocate memory for image embeddings\n");
return false; return false;
} }
int n_img_pos; int n_img_pos;
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
LOG_TEE("%s: cannot encode image, aborting\n", __func__); LOG_ERR("%s: cannot encode image, aborting\n", __func__);
free(image_embd); free(image_embd);
return false; return false;
} }
@ -394,7 +411,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
} }
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) { if (llama_decode(ctx_llama, batch)) {
LOG_TEE("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return false; return false;
} }
*n_past += n_eval; *n_past += n_eval;
@ -406,7 +423,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
clip_image_u8 * img = clip_image_u8_init(); clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img); clip_image_u8_free(img);
LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__); LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
return NULL; return NULL;
} }
@ -415,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) { if (!image_embed_result) {
clip_image_u8_free(img); clip_image_u8_free(img);
LOG_TEE("%s: coulnd't embed the image\n", __func__); LOG_ERR("%s: coulnd't embed the image\n", __func__);
return NULL; return NULL;
} }
@ -429,7 +446,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) { static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
auto file = fopen(path, "rb"); auto file = fopen(path, "rb");
if (file == NULL) { if (file == NULL) {
LOG_TEE("%s: can't read file %s\n", __func__, path); LOG_ERR("%s: can't read file %s\n", __func__, path);
return false; return false;
} }
@ -439,7 +456,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
if (buffer == NULL) { if (buffer == NULL) {
LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
perror("Memory allocation error"); perror("Memory allocation error");
fclose(file); fclose(file);
return false; return false;
@ -464,7 +481,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
long image_bytes_length; long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) { if (!loaded) {
LOG_TEE("%s: failed to load %s\n", __func__, image_path); LOG_ERR("%s: failed to load %s\n", __func__, image_path);
return NULL; return NULL;
} }
@ -1,13 +1,18 @@
#include "ggml.h" #include "arg.h"
#include "log.h" #include "log.h"
#include "common.h" #include "common.h"
#include "sampling.h"
#include "clip.h" #include "clip.h"
#include "llava.h" #include "llava.h"
#include "llama.h" #include "llama.h"
#include "ggml.h"
#include <algorithm>
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <cstring>
#include <vector> #include <vector>
#include <iostream> // TODO: remove me
struct llava_context { struct llava_context {
struct clip_ctx * ctx_clip = NULL; struct clip_ctx * ctx_clip = NULL;
@ -16,14 +21,8 @@ struct llava_context {
}; };
static void show_additional_info(int /*argc*/, char ** argv) { static void show_additional_info(int /*argc*/, char ** argv) {
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
} }
static struct llama_model * llava_init(gpt_params * params) { static struct llama_model * llava_init(gpt_params * params) {
@ -34,7 +33,7 @@ static struct llama_model * llava_init(gpt_params * params) {
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n" , __func__); LOG_ERR("%s: unable to load model\n" , __func__);
return NULL; return NULL;
} }
return model; return model;
@ -49,7 +48,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
if (params->n_ctx < 2048) { if (params->n_ctx < 2048) {
// warn user here, "Image processing requires at least 2048 context, setting context to 2048" // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
ctx_params.n_ctx = 2048; ctx_params.n_ctx = 2048;
} else { } else {
ctx_params.n_ctx = params->n_ctx; ctx_params.n_ctx = params->n_ctx;
@ -58,11 +57,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) { if (ctx_llama == NULL) {
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL; return NULL;
} }
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama; ctx_llava->ctx_llama = ctx_llama;
ctx_llava->model = model; ctx_llava->model = model;
@ -87,7 +86,7 @@ static struct clip_ctx * clip_init_context(gpt_params * params) {
if (prompt.empty()) { if (prompt.empty()) {
prompt = "describe the image in detail."; prompt = "describe the image in detail.";
} }
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
return ctx_clip; return ctx_clip;
} }
@ -99,7 +98,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
n_eval = n_batch; n_eval = n_batch;
} }
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false; return false;
} }
*n_past += n_eval; *n_past += n_eval;
@ -123,7 +122,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip)); float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip)); std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
slice_embed->embed = image_embed; slice_embed->embed = image_embed;
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip); slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past); llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
@ -134,8 +133,14 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
std::string system_prompt; std::string system_prompt;
int idx = 0; int idx = 0;
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip); int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"; int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
LOG_TEE("%s: image token past: %d\n", __func__, n_past); if (has_minicpmv_projector == 2) {
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
}
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
LOG_INF("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false); eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false); eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
@ -154,14 +159,14 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
} }
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false); eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
} }
LOG_TEE("%s: image token past: %d\n", __func__, n_past); LOG_INF("%s: image token past: %d\n", __func__, n_past);
} }
static const char * sample(struct llama_sampling_context * ctx_sampling, static const char * sample(struct gpt_sampler * smpl,
struct llama_context * ctx_llama, struct llama_context * ctx_llama,
int * n_past) { int * n_past) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
llama_sampling_accept(ctx_sampling, ctx_llama, id, true); gpt_sampler_accept(smpl, id, true);
static std::string ret; static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>"; ret = "</s>";
@ -173,58 +178,72 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
} }
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params); auto * ctx_clip = clip_init_context(params);
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str()); auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embeds) { if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n"; LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
return NULL; return NULL;
} }
// process the prompt // process the prompt
if (params->prompt.empty() && params->interactive == false) { if (params->prompt.empty() && params->interactive == false) {
LOG_TEE("prompt should be given or interactive mode should be on"); LOG_ERR("prompt should be given or interactive mode should be on");
return NULL; return NULL;
} }
auto model = llava_init(params); auto * model = llava_init(params);
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__); fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
return NULL; return NULL;
} }
const int64_t t_llava_init_start_us = ggml_time_us(); const int64_t t_llava_init_start_us = ggml_time_us();
auto ctx_llava = llava_init_context(params, model); auto * ctx_llava = llava_init_context(params, model);
ctx_llava->ctx_clip = ctx_clip; ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us(); const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0; float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
const int64_t t_process_image_start_us = ggml_time_us(); const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past); process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us(); const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
llava_image_embed_free(embeds); llava_image_embed_free(embeds);
return ctx_llava; return ctx_llava;
} }
static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
std::string user_prompt = prompt; std::string user_prompt = prompt;
if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt; int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) {
if (has_minicpmv_projector == 2) {
user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
}
else if (has_minicpmv_projector == 3) {
user_prompt = "<|im_start|>user\n" + prompt;
}
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false); if (has_minicpmv_projector == 2) {
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 3) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
// generate the response // generate the response
LOG_TEE("\n"); LOG_INF("\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
return ctx_sampling; return smpl;
} }
static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){ static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
return tmp; return tmp;
} }
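For reference, the two prompt wrappers selected above follow the underlying chat formats (Llama-3-style headers for version 2, ChatML for version 3). A hedged pair of helpers expressing the same strings; the helper names are hypothetical, the string literals are taken from the branches above.

// Hypothetical helpers; the literals mirror the per-version branches above.
static const char * minicpmv_user_prefix(int version) {
    return version == 3 ? "<|im_start|>user\n"
                        : "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
}
static const char * minicpmv_assistant_prefix(int version) {
    return version == 3 ? "<|im_end|><|im_start|>assistant\n"
                        : "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n";
}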
@ -233,41 +252,36 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
show_additional_info(argc, argv);
return 1; return 1;
} }
#ifndef LOG_DISABLE_LOGS gpt_init();
log_set_target(log_filename_generator("llava", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty())) { if (params.mmproj.empty() || (params.image.empty())) {
gpt_params_print_usage(argc, argv, params);
show_additional_info(argc, argv); show_additional_info(argc, argv);
return 1; return 1;
} }
for (auto & image : params.image) { for (auto & image : params.image) {
int n_past = 0; int n_past = 0;
auto ctx_llava = minicpmv_init(&params, image, n_past); auto * ctx_llava = minicpmv_init(&params, image, n_past);
if (!params.prompt.empty()) { if (!params.prompt.empty()) {
LOG_TEE("<user>%s\n", params.prompt.c_str()); LOG("<user>%s\n", params.prompt.c_str());
LOG_TEE("<assistant>"); LOG("<assistant>");
auto ctx_sampling = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true); auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response = ""; std::string response;
bool have_tmp = false; bool have_tmp = false;
for (int i = 0; i < max_tgt_len; i++) { for (int i = 0; i < max_tgt_len; i++) {
auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp; response += tmp;
if (strcmp(tmp, "</s>") == 0){ if (strcmp(tmp, "</s>") == 0){
if(!have_tmp)continue; if (!have_tmp) {
else break; continue;
}
break;
} }
if (strstr(tmp, "###")) break; // Yi-VL behavior if (strstr(tmp, "###")) break; // Yi-VL behavior
have_tmp = true; have_tmp = true;
@@ -276,18 +290,18 @@ int main(int argc, char ** argv) {
fflush(stdout); fflush(stdout);
} }
llama_sampling_free(ctx_sampling); gpt_sampler_free(smpl);
}else { }else {
while (true) { while (true) {
LOG_TEE("<user>"); LOG("<user>");
std::string prompt; std::string prompt;
std::getline(std::cin, prompt); std::getline(std::cin, prompt);
LOG_TEE("<assistant>"); LOG("<assistant>");
auto ctx_sampling = llama_init(ctx_llava, &params, prompt, n_past, true); auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response = ""; std::string response;
for (int i = 0; i < max_tgt_len; i++) { for (int i = 0; i < max_tgt_len; i++) {
auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp; response += tmp;
if (strcmp(tmp, "</s>") == 0) break; if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior if (strstr(tmp, "###")) break; // Yi-VL behavior
@@ -295,11 +309,11 @@ int main(int argc, char ** argv) {
if (strstr(response.c_str(), "<user>")) break; // minicpm-v if (strstr(response.c_str(), "<user>")) break; // minicpm-v
fflush(stdout); fflush(stdout);
} }
llama_sampling_free(ctx_sampling); gpt_sampler_free(smpl);
} }
} }
printf("\n"); printf("\n");
llama_print_timings(ctx_llava->ctx_llama); llama_perf_context_print(ctx_llava->ctx_llama);
ctx_llava->model = NULL; ctx_llava->model = NULL;
llava_free(ctx_llava); llava_free(ctx_llava);


@@ -1,9 +1,416 @@
import argparse # coding=utf-8
# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Siglip model. """
# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit, with tgt_sizes added
import os import os
import math
import warnings
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn.init import _calculate_fan_in_and_fan_out
from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import (
logging,
)
from transformers.utils import logging
logger = logging.get_logger(__name__)
class SiglipVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
Number of channels in the input images.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
Example:
```python
>>> from transformers import SiglipVisionConfig, SiglipVisionModel
>>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
>>> configuration = SiglipVisionConfig()
>>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = SiglipVisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "siglip_vision_model"
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
image_size=224,
patch_size=16,
hidden_act="gelu_pytorch_tanh",
layer_norm_eps=1e-6,
attention_dropout=0.0,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/siglip-base-patch16-224",
# See all SigLIP models at https://huggingface.co/models?filter=siglip
]
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
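As a quick illustration (an editor's sketch that assumes `_get_unpad_data` above is in scope, not part of the upstream file), here is what the helper returns for a small right-padded attention mask; the cumulative sequence lengths are the form expected by varlen flash-attention kernels:

import torch

mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]])                  # two sequences, right-padded to length 4
indices, cu_seqlens, max_seqlen = _get_unpad_data(mask)
print(indices)      # tensor([0, 1, 2, 4, 5]) : flat positions of the real tokens
print(cu_seqlens)   # tensor([0, 3, 5], dtype=torch.int32) : cumulative lengths, prefixed with 0
print(max_seqlen)   # 3 : longest sequence in the batch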
def _trunc_normal_(tensor, mean, std, a, b):
# Cut & paste from PyTorch official master until it's in a few official releases - RW
# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
def norm_cdf(x):
# Computes standard normal cumulative distribution function
return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
if (mean < a - 2 * std) or (mean > b + 2 * std):
warnings.warn(
"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
"The distribution of values may be incorrect.",
stacklevel=2,
)
# Values are generated by using a truncated uniform distribution and
# then using the inverse CDF for the normal distribution.
# Get upper and lower cdf values
l = norm_cdf((a - mean) / std)
u = norm_cdf((b - mean) / std)
# Uniformly fill tensor with values from [l, u], then translate to
# [2l-1, 2u-1].
tensor.uniform_(2 * l - 1, 2 * u - 1)
# Use inverse cdf transform for normal distribution to get truncated
# standard normal
if tensor.dtype in [torch.float16, torch.bfloat16]:
# The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
og_dtype = tensor.dtype
tensor = tensor.to(torch.float32)
tensor.erfinv_()
tensor = tensor.to(og_dtype)
else:
tensor.erfinv_()
# Transform to proper mean, std
tensor.mul_(std * math.sqrt(2.0))
tensor.add_(mean)
# Clamp to ensure it's in the proper range
if tensor.dtype == torch.float16:
# The `clamp_` op is not (yet?) defined in float16+cpu
tensor = tensor.to(torch.float32)
tensor.clamp_(min=a, max=b)
tensor = tensor.to(torch.float16)
else:
tensor.clamp_(min=a, max=b)
def trunc_normal_tf_(
tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
):
"""Fills the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the
normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
with values outside :math:`[a, b]` redrawn until they are within
the bounds. The method used for generating the random values works
best when :math:`a \leq \text{mean} \leq b`.
NOTE: this 'tf' variant behaves closer to the TensorFlow / JAX impl, where the
bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0,
and the result is subsequently scaled and shifted by the mean and std args.
Args:
tensor: an n-dimensional `torch.Tensor`
mean: the mean of the normal distribution
std: the standard deviation of the normal distribution
a: the minimum cutoff value
b: the maximum cutoff value
"""
with torch.no_grad():
_trunc_normal_(tensor, 0, 1.0, a, b)
tensor.mul_(std).add_(mean)
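As a sanity check on the NOTE above (an editor's sketch, not part of the upstream file): because the bounds are applied before scaling, the values produced by `trunc_normal_tf_` end up inside `[mean + a*std, mean + b*std]`:

import torch

w = torch.empty(256, 256)
trunc_normal_tf_(w, mean=0.0, std=0.02)             # defaults: a=-2.0, b=2.0
assert w.abs().max().item() <= 2.0 * 0.02 + 1e-6    # bounds scale with std
print(w.mean().item(), w.std().item())              # mean close to 0, std slightly below 0.02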
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
denom = fan_in
if mode == "fan_in":
denom = fan_in
elif mode == "fan_out":
denom = fan_out
elif mode == "fan_avg":
denom = (fan_in + fan_out) / 2
variance = scale / denom
if distribution == "truncated_normal":
# constant is stddev of standard normal truncated to (-2, 2)
trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
elif distribution == "normal":
with torch.no_grad():
tensor.normal_(std=math.sqrt(variance))
elif distribution == "uniform":
bound = math.sqrt(3 * variance)
with torch.no_grad():
tensor.uniform_(-bound, bound)
else:
raise ValueError(f"invalid distribution {distribution}")
def lecun_normal_(tensor):
variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
def default_flax_embed_init(tensor):
variance_scaling_(tensor, mode="fan_in", distribution="normal")
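To make the fan-in scaling concrete (an editor's sketch; the layer shape here is only an example), `lecun_normal_` draws weights from a truncated normal with variance `1 / fan_in`, which is what `_init_weights` further down applies to `nn.Linear` and `nn.Conv2d` modules:

import math
from torch import nn

conv = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=14, stride=14)
lecun_normal_(conv.weight)              # variance_scaling_ with fan_in and a truncated normal
fan_in = 3 * 14 * 14                    # 588 for this kernel shape
print(conv.weight.std().item())         # roughly math.sqrt(1.0 / fan_in) ~= 0.041
print(math.sqrt(1.0 / fan_in))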
class SiglipVisionEmbeddings(nn.Module):
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
self.patch_embedding = nn.Conv2d(
in_channels=config.num_channels,
out_channels=self.embed_dim,
kernel_size=self.patch_size,
stride=self.patch_size,
padding="valid",
)
self.num_patches_per_side = self.image_size // self.patch_size
self.num_patches = self.num_patches_per_side**2
self.num_positions = self.num_patches
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
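With the vision settings this file uses further down (`image_size` 980 and `patch_size` 14 in `default_vision_config`), the embedding layer above works on a 70x70 patch grid, which is where the `(70, 70)` grid passed to `get_2d_sincos_pos_embed` later in the script comes from. A small sketch of the arithmetic (editor's illustration):

image_size = 980                                  # from default_vision_config below
patch_size = 14
num_patches_per_side = image_size // patch_size   # 70
num_patches = num_patches_per_side ** 2           # 4900 positions in the embedding table
print(num_patches_per_side, num_patches)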
class SiglipAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
# Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
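With the `SiglipVisionConfig` defaults shown above (`hidden_size=768`, `num_attention_heads=12`), the per-head sizes work out as below (an editor's sketch of the arithmetic in `__init__`):

embed_dim = 768                            # SiglipVisionConfig default hidden_size
num_heads = 12                             # SiglipVisionConfig default num_attention_heads
head_dim = embed_dim // num_heads          # 64
scale = head_dim ** -0.5                   # 0.125, the scaling factor stored as self.scale
assert head_dim * num_heads == embed_dim   # otherwise __init__ above raises ValueError
print(head_dim, scale)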
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.embed_dim = config.hidden_size
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self.self_attn = (
SiglipAttention(config)
)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = SiglipMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
class SiglipPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = SiglipVisionConfig
base_model_prefix = "siglip"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, SiglipVisionEmbeddings):
width = self.config.hidden_size
nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
elif isinstance(module, nn.Embedding):
default_flax_embed_init(module.weight)
elif isinstance(module, SiglipAttention):
nn.init.normal_(module.q_proj.weight)
nn.init.normal_(module.k_proj.weight)
nn.init.normal_(module.v_proj.weight)
nn.init.normal_(module.out_proj.weight)
nn.init.zeros_(module.q_proj.bias)
nn.init.zeros_(module.k_proj.bias)
nn.init.zeros_(module.v_proj.bias)
nn.init.zeros_(module.out_proj.bias)
elif isinstance(module, SiglipMLP):
nn.init.normal_(module.fc1.weight)
nn.init.normal_(module.fc2.weight)
nn.init.normal_(module.fc1.bias, std=1e-6)
nn.init.normal_(module.fc2.bias, std=1e-6)
elif isinstance(module, (nn.Linear, nn.Conv2d)):
lecun_normal_(module.weight)
if module.bias is not None:
nn.init.zeros_(module.bias)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
SIGLIP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SIGLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`SiglipEncoderLayer`].
Args:
config: SiglipConfig
"""
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.config = config
self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
class SiglipVisionTransformer(SiglipPreTrainedModel):
config_class = SiglipVisionConfig
main_input_name = "pixel_values"
_supports_flash_attn_2 = True
def __init__(self, config: SiglipVisionConfig):
super().__init__(config)
self.config = config
embed_dim = config.hidden_size
self.embeddings = SiglipVisionEmbeddings(config)
self.encoder = SiglipEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.embeddings.patch_embedding
import argparse
import json import json
import re import re
import torch
import numpy as np import numpy as np
from gguf import * from gguf import *
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
@@ -94,6 +501,7 @@ default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711] default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2)
# with proper # with proper
args = ap.parse_args() args = ap.parse_args()
@@ -135,6 +543,15 @@ if args.use_f32:
# model = CLIPModel.from_pretrained(dir_model) # model = CLIPModel.from_pretrained(dir_model)
# processor = CLIPProcessor.from_pretrained(dir_model) # processor = CLIPProcessor.from_pretrained(dir_model)
minicpmv_version = args.minicpmv_version
emb_dim = 4096
if minicpmv_version == 1:
emb_dim = 2304
elif minicpmv_version == 2:
emb_dim = 4096
elif minicpmv_version == 3:
emb_dim = 3584
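Restated compactly (an editor's sketch, not part of the script): `--minicpmv_version` selects the resampler embedding width used below for the 2D sin/cos position embeddings, with 1 for MiniCPM-V-2 (2304), 2 for MiniCPM-V-2.5 (4096, also the fallback), and 3 for MiniCPM-V-2.6 (3584). A hypothetical helper expressing the same mapping:

MINICPMV_EMB_DIM = {1: 2304, 2: 4096, 3: 3584}      # version -> resampler embedding width

def resampler_emb_dim(version: int) -> int:
    # falls back to 4096, matching the script's initial default for emb_dim
    return MINICPMV_EMB_DIM.get(version, 4096)

assert resampler_emb_dim(3) == 3584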
default_vision_config = { default_vision_config = {
"hidden_size": 1152, "hidden_size": 1152,
"image_size": 980, "image_size": 980,
@@ -144,8 +561,12 @@ default_vision_config = {
"num_hidden_layers": 27, "num_hidden_layers": 27,
"patch_size": 14, "patch_size": 14,
} }
vision_config = Idefics2VisionConfig(**default_vision_config) vision_config = Idefics2VisionConfig(**default_vision_config)
model = Idefics2VisionTransformer(vision_config) model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3:
vision_config = SiglipVisionConfig(**default_vision_config)
model = SiglipVisionTransformer(vision_config)
processor = None processor = None
# if model.attn_pool is not None: # if model.attn_pool is not None:
@@ -158,6 +579,7 @@ fname_middle = None
has_text_encoder = True has_text_encoder = True
has_vision_encoder = True has_vision_encoder = True
has_minicpmv_projector = False has_minicpmv_projector = False
if args.text_only: if args.text_only:
fname_middle = "text-" fname_middle = "text-"
has_vision_encoder = False has_vision_encoder = False
@@ -165,6 +587,7 @@ elif args.minicpmv_projector is not None:
fname_middle = "mmproj-" fname_middle = "mmproj-"
has_text_encoder = False has_text_encoder = False
has_minicpmv_projector = True has_minicpmv_projector = True
minicpmv_version = 3
elif args.vision_only: elif args.vision_only:
fname_middle = "vision-" fname_middle = "vision-"
has_text_encoder = False has_text_encoder = False
@@ -189,6 +612,7 @@ elif has_minicpmv_projector:
fout.add_description("image encoder for MiniCPM-V") fout.add_description("image encoder for MiniCPM-V")
# add projector type # add projector type
fout.add_string("clip.projector_type", "resampler") fout.add_string("clip.projector_type", "resampler")
fout.add_int32("clip.minicpmv_version", minicpmv_version)
else: else:
fout.add_description("two-tower CLIP model") fout.add_description("two-tower CLIP model")
@@ -274,11 +698,11 @@ def _replace_name_resampler(s, v):
if re.match("resampler.pos_embed", s): if re.match("resampler.pos_embed", s):
return { return {
s: v, s: v,
re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))), re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
} }
if re.match("resampler.proj", s): if re.match("resampler.proj", s):
return { return {
re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))), re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(), re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
} }
if re.match("resampler.attn.in_proj_.*", s): if re.match("resampler.attn.in_proj_.*", s):


@@ -4,7 +4,7 @@ import torch
from transformers import AutoModel, AutoTokenizer from transformers import AutoModel, AutoTokenizer
ap = argparse.ArgumentParser() ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", help="Path to MiniCPM-V-2.5 model") ap.add_argument("-m", "--model", help="Path to MiniCPM-V model")
args = ap.parse_args() args = ap.parse_args()
# find the model part that includes the multimodal projector weights # find the model part that includes the multimodal projector weights
@@ -29,7 +29,6 @@ if len(clip_tensors) > 0:
f.write("{}\n") f.write("{}\n")
config = model.llm.config config = model.llm.config
config._name_or_path = "openbmb/MiniCPM-Llama3-V-2.5"
config.auto_map = { config.auto_map = {
"AutoConfig": "configuration_minicpm.MiniCPMConfig", "AutoConfig": "configuration_minicpm.MiniCPMConfig",
"AutoModel": "modeling_minicpm.MiniCPMModel", "AutoModel": "modeling_minicpm.MiniCPMModel",
@@ -40,7 +39,6 @@ config.auto_map = {
model.llm.save_pretrained(f"{args.model}/model") model.llm.save_pretrained(f"{args.model}/model")
tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
tok.save_pretrained(f"{args.model}/model") tok.save_pretrained(f"{args.model}/model")
# os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM_l3/modeling_minicpm.py")
print("Done!") print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")


@@ -2,4 +2,4 @@
--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://download.pytorch.org/whl/cpu
pillow~=10.2.0 pillow~=10.2.0
torch~=2.2.1 torch~=2.2.1
torchvision==0.17.1 torchvision~=0.17.1


@@ -1,7 +1,9 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <cmath>
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <vector> #include <vector>
@@ -37,23 +39,18 @@ struct ngram_container {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
gpt_init();
const int W = 15; // lookahead window const int W = 15; // lookahead window
const int N = 5; // n-gram size const int N = 5; // n-gram size
const int G = 15; // max verification n-grams const int G = 15; // max verification n-grams
const bool dump_kv_cache = params.dump_kv_cache; const bool dump_kv_cache = params.dump_kv_cache;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("lookahead", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp // init llama.cpp
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@@ -75,14 +72,14 @@ int main(int argc, char ** argv) {
const int max_tokens_list_size = max_context_size - 4; const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) { if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1; return 1;
} }
fprintf(stderr, "\n\n"); LOG("\n\n");
for (auto id : inp) { for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); LOG("%s", llama_token_to_piece(ctx, id).c_str());
} }
fflush(stderr); fflush(stderr);
@@ -118,7 +115,7 @@ int main(int argc, char ** argv) {
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
// target model sampling context // target model sampling context
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
// verification n-grams // verification n-grams
std::vector<ngram_data> ngrams_cur(G); std::vector<ngram_data> ngrams_cur(G);
@@ -159,14 +156,14 @@ int main(int argc, char ** argv) {
// sample first token // sample first token
{ {
id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0); id = gpt_sampler_sample(smpl, ctx, 0);
llama_sampling_accept(ctx_sampling, ctx, id, true); gpt_sampler_accept(smpl, id, true);
{ {
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str()); LOG("%s", token_str.c_str());
fflush(stdout); fflush(stdout);
} }
} }
@@ -256,7 +253,7 @@ int main(int argc, char ** argv) {
} }
if (llama_decode(ctx, batch) != 0) { if (llama_decode(ctx, batch) != 0) {
fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__); LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
return 1; return 1;
} }
@@ -284,19 +281,19 @@ int main(int argc, char ** argv) {
} }
// sample the next token // sample the next token
id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch); id = gpt_sampler_sample(smpl, ctx, i_batch);
llama_sampling_accept(ctx_sampling, ctx, id, true); gpt_sampler_accept(smpl, id, true);
// print // print
{ {
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = llama_token_to_piece(ctx, id);
if (v == 0) { if (v == 0) {
printf("%s", token_str.c_str()); LOG("%s", token_str.c_str());
} else { } else {
// print light cyan // print light cyan
printf("\033[0;96m%s\033[0m", token_str.c_str()); LOG("\033[0;96m%s\033[0m", token_str.c_str());
} }
fflush(stdout); fflush(stdout);
@@ -330,21 +327,21 @@ int main(int argc, char ** argv) {
// print known n-grams starting with token id (debug) // print known n-grams starting with token id (debug)
if (0 && v == 0) { if (0 && v == 0) {
if (ngrams_observed.cnt[id] > 0) { if (ngrams_observed.cnt[id] > 0) {
printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
} }
for (int i = 0; i < ngrams_observed.cnt[id]; i++) { for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
printf(" - ngram %2d: ", i); LOG(" - ngram %2d: ", i);
const int idx = id*(N - 1)*G + i*(N - 1); const int idx = id*(N - 1)*G + i*(N - 1);
for (int j = 0; j < N - 1; j++) { for (int j = 0; j < N - 1; j++) {
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
printf("%s", token_str.c_str()); LOG("%s", token_str.c_str());
} }
printf("\n"); LOG("\n");
} }
} }
@@ -361,7 +358,7 @@ int main(int argc, char ** argv) {
if (v == 0) { if (v == 0) {
// sample from the last level // sample from the last level
for (int i = 0; i < W; i++) { for (int i = 0; i < W; i++) {
tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i); tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
} }
} else { } else {
for (int i = 0; i < W; i++) { for (int i = 0; i < W; i++) {
@@ -455,23 +452,25 @@ int main(int argc, char ** argv) {
auto t_dec_end = ggml_time_us(); auto t_dec_end = ggml_time_us();
LOG_TEE("\n\n"); LOG("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("W = %2d\n", W); LOG_INF("W = %2d\n", W);
LOG_TEE("N = %2d\n", N); LOG_INF("N = %2d\n", N);
LOG_TEE("G = %2d\n", G); LOG_INF("G = %2d\n", G);
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("n_predict = %d\n", n_predict); LOG_INF("n_predict = %d\n", n_predict);
LOG_TEE("n_accept = %d\n", n_accept); LOG_INF("n_accept = %d\n", n_accept);
llama_print_timings(ctx); LOG_INF("\n");
gpt_perf_print(ctx, smpl);
gpt_sampler_free(smpl);
llama_kv_cache_view_free(&kvc_view); llama_kv_cache_view_free(&kvc_view);
llama_sampling_free(ctx_sampling);
llama_batch_free(batch); llama_batch_free(batch);
@@ -480,7 +479,7 @@ int main(int argc, char ** argv) {
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n"); LOG("\n\n");
return 0; return 0;
} }


@@ -1,7 +1,8 @@
#include "ggml.h" #include "arg.h"
#include "llama.h"
#include "common.h" #include "common.h"
#include "ngram-cache.h" #include "ngram-cache.h"
#include "ggml.h"
#include "llama.h"
#include <cstdint> #include <cstdint>
#include <fstream> #include <fstream>
@@ -13,8 +14,7 @@
int main(int argc, char ** argv){ int main(int argc, char ** argv){
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
@@ -40,4 +40,6 @@ int main(int argc, char ** argv){
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str()); fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static); llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
return 0;
} }


@@ -1,25 +1,26 @@
#include "ggml.h" #include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h"
#include "log.h" #include "log.h"
#include "ngram-cache.h" #include "ngram-cache.h"
#include "llama.h"
#include "ggml.h"
#include <cmath>
#include <cstdint> #include <cstdint>
#include <cstdio> #include <cstdio>
#include <cinttypes>
#include <fstream> #include <fstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <unordered_map>
int main(int argc, char ** argv){ int main(int argc, char ** argv){
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
gpt_init();
const int n_draft = params.n_draft; const int n_draft = params.n_draft;
// init llama.cpp // init llama.cpp
@@ -49,7 +50,7 @@ int main(int argc, char ** argv){
try { try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) { } catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1); exit(1);
} }
} }
@@ -128,7 +129,7 @@ int main(int argc, char ** argv){
const int64_t eta_min = eta_ms / (60*1000); const int64_t eta_min = eta_ms / (60*1000);
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000; const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
} }
// After each chunk, update the dynamic ngram cache with the context ngram cache: // After each chunk, update the dynamic ngram cache with the context ngram cache:
@@ -136,24 +137,24 @@ int main(int argc, char ** argv){
ngram_cache_context.clear(); ngram_cache_context.clear();
} }
LOG_TEE("\n"); LOG("\n");
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("n_draft = %d\n", n_draft); LOG_INF("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx); LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
LOG_TEE("n_drafted = %d\n", n_drafted); LOG_INF("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept); LOG_INF("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n"); LOG("\n\n");
return 0; return 0;
} }


@@ -1,35 +1,31 @@
#include "arg.h"
#include "ggml.h" #include "ggml.h"
#include "llama.h"
#include "common.h" #include "common.h"
#include "ngram-cache.h" #include "ngram-cache.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include <cmath>
#include <cstdint> #include <cstdint>
#include <cstdio> #include <cstdio>
#include <fstream> #include <fstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <unordered_map>
int main(int argc, char ** argv){ int main(int argc, char ** argv){
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
gpt_init();
// max. number of additional tokens to draft if match is found // max. number of additional tokens to draft if match is found
const int n_draft = params.n_draft; const int n_draft = params.n_draft;
const bool dump_kv_cache = params.dump_kv_cache; const bool dump_kv_cache = params.dump_kv_cache;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("lookup", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp // init llama.cpp
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@@ -59,7 +55,7 @@ int main(int argc, char ** argv){
try { try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) { } catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1); exit(1);
} }
} }
@@ -77,14 +73,14 @@ int main(int argc, char ** argv){
const int max_tokens_list_size = max_context_size - 4; const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) { if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1; return 1;
} }
fprintf(stderr, "\n\n"); LOG("\n\n");
for (auto id : inp) { for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); LOG("%s", llama_token_to_piece(ctx, id).c_str());
} }
fflush(stderr); fflush(stderr);
@@ -106,7 +102,7 @@ int main(int argc, char ** argv){
bool has_eos = false; bool has_eos = false;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
std::vector<llama_token> draft; std::vector<llama_token> draft;
@@ -125,19 +121,19 @@ int main(int argc, char ** argv){
} }
// print current draft sequence // print current draft sequence
LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str()); LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
int i_dft = 0; int i_dft = 0;
while (true) { while (true) {
// sample from the target model // sample from the target model
llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft); llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);
llama_sampling_accept(ctx_sampling, ctx, id, true); gpt_sampler_accept(smpl, id, true);
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = llama_token_to_piece(ctx, id);
if (!params.use_color) { if (!params.use_color) {
printf("%s", token_str.c_str()); LOG("%s", token_str.c_str());
} }
if (llama_token_is_eog(model, id)) { if (llama_token_is_eog(model, id)) {
@@ -148,7 +144,7 @@ int main(int argc, char ** argv){
// check if the target token matches the draft // check if the target token matches the draft
if (i_dft < (int) draft.size() && id == draft[i_dft]) { if (i_dft < (int) draft.size() && id == draft[i_dft]) {
LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
++n_accept; ++n_accept;
++n_past; ++n_past;
++i_dft; ++i_dft;
@@ -162,19 +158,19 @@ int main(int argc, char ** argv){
if (params.use_color) { if (params.use_color) {
// color accepted draft token // color accepted draft token
printf("\033[34m%s\033[0m", token_str.c_str()); LOG("\033[34m%s\033[0m", token_str.c_str());
fflush(stdout); fflush(stdout);
} }
continue; continue;
} }
if (params.use_color) { if (params.use_color) {
printf("%s", token_str.c_str()); LOG("%s", token_str.c_str());
} }
fflush(stdout); fflush(stdout);
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
draft.clear(); draft.clear();
draft.push_back(id); draft.push_back(id);
@@ -225,25 +221,26 @@ int main(int argc, char ** argv){
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
LOG_TEE("\n\n"); LOG("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("n_draft = %d\n", n_draft); LOG_INF("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict); LOG_INF("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted); LOG_INF("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept); LOG_INF("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_TEE("\ntarget:\n"); LOG_INF("\ntarget:\n\n");
llama_print_timings(ctx); gpt_perf_print(ctx, smpl);
gpt_sampler_free(smpl);
llama_sampling_free(ctx_sampling);
llama_batch_free(batch_tgt); llama_batch_free(batch_tgt);
llama_free(ctx); llama_free(ctx);
@@ -251,7 +248,7 @@ int main(int argc, char ** argv){
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n"); LOG("\n\n");
return 0; return 0;
} }


@@ -161,6 +161,8 @@ A value of -1 will enable infinite text generation, even though we have a finite
If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled. If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.
The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full.
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter. It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
### Temperature ### Temperature


@@ -1,11 +1,11 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "console.h" #include "console.h"
#include "log.h"
#include "sampling.h"
#include "llama.h" #include "llama.h"
#include <cassert> #include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <ctime> #include <ctime>
@@ -33,6 +33,7 @@
static llama_context ** g_ctx; static llama_context ** g_ctx;
static llama_model ** g_model; static llama_model ** g_model;
static gpt_sampler ** g_smpl;
static gpt_params * g_params; static gpt_params * g_params;
static std::vector<llama_token> * g_input_tokens; static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss; static std::ostringstream * g_output_ss;
@@ -40,6 +41,15 @@ static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false; static bool is_interacting = false;
static bool need_insert_eot = false; static bool need_insert_eot = false;
static void print_usage(int argc, char ** argv) {
(void) argc;
LOG("\nexample usage:\n");
LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
LOG("\n");
}
static bool file_exists(const std::string & path) { static bool file_exists(const std::string & path) {
std::ifstream f(path.c_str()); std::ifstream f(path.c_str());
return f.good(); return f.good();
@@ -65,8 +75,7 @@ static void write_logfile(
const bool success = fs_create_directory_with_parents(params.logdir); const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) { if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
__func__, params.logdir.c_str());
return; return;
} }
@@ -74,7 +83,7 @@ static void write_logfile(
FILE * logfile = fopen(logfile_path.c_str(), "w"); FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) { if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return; return;
} }
@@ -92,7 +101,7 @@ static void write_logfile(
yaml_dump_string_multiline(logfile, "output", output.c_str()); yaml_dump_string_multiline(logfile, "output", output.c_str());
yaml_dump_vector_int(logfile, "output_tokens", output_tokens); yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx); llama_perf_dump_yaml(logfile, ctx);
fclose(logfile); fclose(logfile);
} }
@@ -104,50 +113,38 @@ static void sigint_handler(int signo) {
need_insert_eot = true; need_insert_eot = true;
} else { } else {
console::cleanup(); console::cleanup();
printf("\n"); LOG("\n");
llama_print_timings(*g_ctx); gpt_perf_print(*g_ctx, *g_smpl);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
// make sure all logs are flushed
LOG("Interrupted by user\n");
gpt_log_pause(gpt_log_main());
_exit(130); _exit(130);
} }
} }
} }
#endif #endif
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
llama_chat_msg new_msg{role, content}; llama_chat_msg new_msg{role, content};
auto formatted = llama_chat_format_single( auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
model, g_params->chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content}); chat_msgs.push_back({role, content});
LOG("formatted: %s\n", formatted.c_str()); LOG_DBG("formatted: '%s'\n", formatted.c_str());
return formatted; return formatted;
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
g_params = &params; g_params = &params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
llama_sampling_params & sparams = params.sparams; gpt_init();
#ifndef LOG_DISABLE_LOGS auto & sparams = params.sparams;
log_set_target(log_filename_generator("main", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
// TODO: Dump params ?
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
// save choice to use color for later // save choice to use color for later
// (note for later: this is a slightly awkward choice) // (note for later: this is a slightly awkward choice)
@@ -155,123 +152,141 @@ int main(int argc, char ** argv) {
atexit([]() { console::cleanup(); }); atexit([]() { console::cleanup(); });
if (params.logits_all) { if (params.logits_all) {
printf("\n************\n"); LOG_ERR("************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
printf("************\n\n"); LOG_ERR("************\n\n");
return 0; return 0;
} }
if (params.embedding) { if (params.embedding) {
printf("\n************\n"); LOG_ERR("************\n");
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
printf("************\n\n"); LOG_ERR("************\n\n");
return 0; return 0;
} }
if (params.n_ctx != 0 && params.n_ctx < 8) { if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8; params.n_ctx = 8;
} }
if (params.rope_freq_base != 0.0) { if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
} }
if (params.rope_freq_scale != 0.0) { if (params.rope_freq_scale != 0.0) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
} }
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); LOG_INF("%s: llama backend init\n", __func__);
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
LOG("%s: llama backend init\n", __func__);
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
llama_model * model; llama_model * model = nullptr;
llama_context * ctx; llama_context * ctx = nullptr;
llama_context * ctx_guidance = NULL; gpt_sampler * smpl = nullptr;
std::vector<llama_chat_msg> chat_msgs; std::vector<llama_chat_msg> chat_msgs;
g_model = &model; g_model = &model;
g_ctx = &ctx; g_ctx = &ctx;
g_smpl = &smpl;
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__); LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params); llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model; model = llama_init.model;
ctx = llama_init.context; ctx = llama_init.context;
if (sparams.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
ctx_guidance = llama_new_context_with_model(model, lparams);
}
if (model == NULL) { if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__); LOG_ERR("%s: error: unable to load model\n", __func__);
return 1; return 1;
} }
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp =
ggml_threadpool_params_from_cpu_params(params.cpuparams);
set_process_priority(params.cpuparams.priority);
struct ggml_threadpool * threadpool_batch = NULL;
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
threadpool_batch = ggml_threadpool_new(&tpp_batch);
if (!threadpool_batch) {
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
return 1;
}
// Start the non-batch threadpool in the paused state
tpp.paused = true;
}
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
return 1;
}
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) { if (n_ctx > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
__func__, n_ctx_train, n_ctx);
} }
// print chat template example in conversation mode // print chat template example in conversation mode
if (params.conversation) { if (params.conversation) {
if (params.enable_chat_template) { if (params.enable_chat_template) {
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
} else { } else {
LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
} }
} }
// print system information // print system information
{ {
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
} }
std::string path_session = params.path_prompt_cache; std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens; std::vector<llama_token> session_tokens;
if (!path_session.empty()) { if (!path_session.empty()) {
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
if (!file_exists(path_session)) { if (!file_exists(path_session)) {
LOG_TEE("%s: session file does not exist, will create.\n", __func__); LOG_INF("%s: session file does not exist, will create.\n", __func__);
} else if (file_is_empty(path_session)) { } else if (file_is_empty(path_session)) {
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__); LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
} else { } else {
// The file exists and is not empty // The file exists and is not empty
session_tokens.resize(n_ctx); session_tokens.resize(n_ctx);
size_t n_token_count_out = 0; size_t n_token_count_out = 0;
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1; return 1;
} }
session_tokens.resize(n_token_count_out); session_tokens.resize(n_token_count_out);
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
} }
} }
const bool add_bos = llama_should_add_bos_token(model); const bool add_bos = llama_add_bos_token(model);
if (!llama_model_has_encoder(model)) { if (!llama_model_has_encoder(model)) {
GGML_ASSERT(llama_add_eos_token(model) != 1); GGML_ASSERT(!llama_add_eos_token(model));
} }
LOG("add_bos: %d\n", add_bos);
LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
@@ -280,49 +295,31 @@ int main(int argc, char ** argv) {
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt; : params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n"); LOG_DBG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, prompt, true, true); embd_inp = ::llama_tokenize(ctx, prompt, true, true);
} else { } else {
LOG("use session tokens\n"); LOG_DBG("use session tokens\n");
embd_inp = session_tokens; embd_inp = session_tokens;
} }
LOG("prompt: \"%s\"\n", log_tostr(prompt)); LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
} }
// Should not run without any tokens // Should not run without any tokens
if (embd_inp.empty()) { if (embd_inp.empty()) {
if (add_bos) { if (add_bos) {
embd_inp.push_back(llama_token_bos(model)); embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else { } else {
LOG_TEE("error: input is empty\n"); LOG_ERR("input is empty\n");
return -1; return -1;
} }
} }
// Tokenize negative prompt // Tokenize negative prompt
std::vector<llama_token> guidance_inp;
int guidance_offset = 0;
int original_prompt_len = 0;
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
LOG("guidance_offset: %s", log_tostr(guidance_offset));
}
if ((int) embd_inp.size() > n_ctx - 4) { if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1; return 1;
} }
@@ -336,29 +333,28 @@ int main(int argc, char ** argv) {
n_matching_session_tokens++; n_matching_session_tokens++;
} }
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
LOG_TEE("%s: using full prompt from session file\n", __func__); LOG_INF("%s: using full prompt from session file\n", __func__);
} else if (n_matching_session_tokens >= embd_inp.size()) { } else if (n_matching_session_tokens >= embd_inp.size()) {
LOG_TEE("%s: session file has exact match for prompt!\n", __func__); LOG_INF("%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) { } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
__func__, n_matching_session_tokens, embd_inp.size()); __func__, n_matching_session_tokens, embd_inp.size());
} else { } else {
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
__func__, n_matching_session_tokens, embd_inp.size()); __func__, n_matching_session_tokens, embd_inp.size());
} }
// remove any "future" tokens that we might have inherited from the previous session // remove any "future" tokens that we might have inherited from the previous session
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
} }
LOGLN( LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
"recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu", embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
// if we will use the cache for the full prompt without reaching the end of the cache, force // if we will use the cache for the full prompt without reaching the end of the cache, force
// reevaluation of the last token to recalculate the cached logits // reevaluation of the last token to recalculate the cached logits
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
session_tokens.resize(embd_inp.size() - 1); session_tokens.resize(embd_inp.size() - 1);
} }
@@ -380,30 +376,20 @@ int main(int argc, char ** argv) {
} }
if (params.verbose_prompt) { if (params.verbose_prompt) {
LOG_TEE("\n"); LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) { for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (ctx_guidance) {
LOG_TEE("\n");
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
}
} }
if (params.n_keep > add_bos) { if (params.n_keep > add_bos) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__); LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) { for (int i = 0; i < params.n_keep; i++) {
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
} }
LOG_TEE("'\n"); LOG_CNT("'\n");
} }
LOG_TEE("\n"); LOG_INF("\n");
} }
// ctrl+C handling // ctrl+C handling
@@ -423,47 +409,56 @@ int main(int argc, char ** argv) {
} }
if (params.interactive) { if (params.interactive) {
LOG_TEE("%s: interactive mode on.\n", __func__); LOG_INF("%s: interactive mode on.\n", __func__);
if (!params.antiprompt.empty()) { if (!params.antiprompt.empty()) {
for (const auto & antiprompt : params.antiprompt) { for (const auto & antiprompt : params.antiprompt) {
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
if (params.verbose_prompt) { if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
} }
} }
} }
} }
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
LOG_TEE("Input prefix with BOS\n"); LOG_INF("Input prefix with BOS\n");
} }
if (!params.input_prefix.empty()) { if (!params.input_prefix.empty()) {
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
if (params.verbose_prompt) { if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
} }
} }
} }
if (!params.input_suffix.empty()) { if (!params.input_suffix.empty()) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
if (params.verbose_prompt) { if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
} }
} }
} }
} }
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str()); smpl = gpt_sampler_init(model, sparams);
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); if (!smpl) {
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
return 1;
}
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
// group-attention state // group-attention state
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1) // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@@ -477,9 +472,9 @@ int main(int argc, char ** argv) {
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
} }
LOG_TEE("\n\n"); LOG_INF("\n");
if (params.interactive) { if (params.interactive) {
const char * control_message; const char * control_message;
@@ -491,11 +486,11 @@ int main(int argc, char ** argv) {
" - To return control without starting a new line, end your input with '/'.\n" " - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n"; " - If you want to submit another line, end your input with '\\'.\n";
} }
LOG_TEE("== Running in interactive mode. ==\n"); LOG_INF("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); LOG_INF( " - Press Ctrl+C to interject at any time.\n");
#endif #endif
LOG_TEE( "%s\n", control_message); LOG_INF( "%s\n", control_message);
is_interacting = params.interactive_first; is_interacting = params.interactive_first;
} }
@@ -509,7 +504,6 @@ int main(int argc, char ** argv) {
int n_remain = params.n_predict; int n_remain = params.n_predict;
int n_consumed = 0; int n_consumed = 0;
int n_session_consumed = 0; int n_session_consumed = 0;
int n_past_guidance = 0;
std::vector<int> input_tokens; g_input_tokens = &input_tokens; std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens; std::vector<int> output_tokens; g_output_tokens = &output_tokens;
@@ -521,7 +515,6 @@ int main(int argc, char ** argv) {
display = params.display_prompt; display = params.display_prompt;
std::vector<llama_token> embd; std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
// tokenized antiprompts // tokenized antiprompts
std::vector<std::vector<llama_token>> antiprompt_ids; std::vector<std::vector<llama_token>> antiprompt_ids;
@@ -531,18 +524,12 @@ int main(int argc, char ** argv) {
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true)); antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
} }
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
if (!ctx_sampling) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
if (llama_model_has_encoder(model)) { if (llama_model_has_encoder(model)) {
int enc_input_size = embd_inp.size(); int enc_input_size = embd_inp.size();
llama_token * enc_input_buf = embd_inp.data(); llama_token * enc_input_buf = embd_inp.data();
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) { if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
LOG_TEE("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return 1; return 1;
} }
@@ -568,9 +555,8 @@ int main(int argc, char ** argv) {
embd.resize(max_embd_size); embd.resize(max_embd_size);
console::set_display(console::error); console::set_display(console::error);
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset); console::set_display(console::reset);
fflush(stdout);
} }
if (ga_n == 1) { if (ga_n == 1) {
@@ -578,33 +564,35 @@ int main(int argc, char ** argv) {
// if we run out of context: // if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past) // - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
if (params.n_predict == -2) { if (n_past + (int) embd.size() >= n_ctx) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); if (!params.ctx_shift){
LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
break; break;
} else {
if (params.n_predict == -2) {
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
n_past -= n_discard;
LOG_DBG("after swap: n_past = %d\n", n_past);
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
LOG_DBG("clear session path\n");
path_session.clear();
} }
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
n_past -= n_discard;
if (ctx_guidance) {
n_past_guidance -= n_discard;
}
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG("clear session path\n");
path_session.clear();
} }
} else { } else {
// context extension via Self-Extend // context extension via Self-Extend
@@ -613,10 +601,10 @@ int main(int argc, char ** argv) {
const int bd = (ga_w/ga_n)*(ga_n - 1); const int bd = (ga_w/ga_n)*(ga_n - 1);
const int dd = (ga_w/ga_n) - ib*bd - ga_w; const int dd = (ga_w/ga_n) - ib*bd - ga_w;
LOG("\n"); LOG_DBG("\n");
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
@@ -626,7 +614,7 @@ int main(int argc, char ** argv) {
ga_i += ga_w/ga_n; ga_i += ga_w/ga_n;
LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
} }
} }
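Because the old and new branches are interleaved above, here is the context-shift path in isolation, as it reads after the change: when the cache fills and context shifting is enabled, half of the tokens after n_keep are dropped and the remainder slide back. All calls and variables below are taken from the hunk itself; the Self-Extend branch keeps its llama_kv_cache_seq_add/seq_div logic unchanged apart from the logging macros.

    if (n_past + (int) embd.size() >= n_ctx) {
        const int n_left    = n_past - params.n_keep;
        const int n_discard = n_left/2;

        // drop [n_keep, n_keep + n_discard) and shift the rest of the cache back by n_discard
        llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
        llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

        n_past -= n_discard;
        path_session.clear(); // the saved session no longer matches the shifted cache
    }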
@@ -652,65 +640,25 @@ int main(int argc, char ** argv) {
} }
} }
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
if (ctx_guidance) {
int input_size = 0;
llama_token * input_buf = NULL;
if (n_past_guidance < (int) guidance_inp.size()) {
// Guidance context should have the same data with these modifications:
//
// * Replace the initial prompt
// * Shift everything by guidance_offset
embd_guidance = guidance_inp;
if (embd.begin() + original_prompt_len < embd.end()) {
embd_guidance.insert(
embd_guidance.end(),
embd.begin() + original_prompt_len,
embd.end()
);
}
input_buf = embd_guidance.data();
input_size = embd_guidance.size();
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
} else {
input_buf = embd.data();
input_size = embd.size();
}
for (int i = 0; i < input_size; i += params.n_batch) {
int n_eval = std::min(input_size - i, params.n_batch);
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
return 1;
}
n_past_guidance += n_eval;
}
}
for (int i = 0; i < (int) embd.size(); i += params.n_batch) { for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i; int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) { if (n_eval > params.n_batch) {
n_eval = params.n_batch; n_eval = params.n_batch;
} }
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_TEE("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return 1; return 1;
} }
n_past += n_eval; n_past += n_eval;
LOG("n_past = %d\n", n_past); LOG_DBG("n_past = %d\n", n_past);
// Display total tokens alongside total time // Display total tokens alongside total time
if (params.n_print > 0 && n_past % params.n_print == 0) { if (params.n_print > 0 && n_past % params.n_print == 0) {
LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
} }
} }
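For readability, the evaluation loop above reduces to the following once the change is applied; apart from the logging macros it is unchanged from the previous version:

    for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
        int n_eval = (int) embd.size() - i;
        if (n_eval > params.n_batch) {
            n_eval = params.n_batch;
        }

        if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return 1;
        }

        n_past += n_eval; // the KV cache now covers n_past positions
    }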
@@ -721,7 +669,6 @@ int main(int argc, char ** argv) {
} }
embd.clear(); embd.clear();
embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// optionally save the session on first sample (for faster prompt loading next time) // optionally save the session on first sample (for faster prompt loading next time)
@@ -729,14 +676,14 @@ int main(int argc, char ** argv) {
need_to_save_session = false; need_to_save_session = false;
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
LOG("saved session to %s\n", path_session.c_str()); LOG_DBG("saved session to %s\n", path_session.c_str());
} }
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true); gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
embd.push_back(id); embd.push_back(id);
@@ -746,16 +693,16 @@ int main(int argc, char ** argv) {
// decrement remaining sampling budget // decrement remaining sampling budget
--n_remain; --n_remain;
LOG("n_remain: %d\n", n_remain); LOG_DBG("n_remain: %d\n", n_remain);
} else { } else {
// some user input remains from prompt or interaction, forward it to processing // some user input remains from prompt or interaction, forward it to processing
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) { while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]); embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later // push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules // for the prompt, we don't apply grammar rules
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false); gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
++n_consumed; ++n_consumed;
if ((int) embd.size() >= params.n_batch) { if ((int) embd.size() >= params.n_batch) {
@@ -770,7 +717,7 @@ int main(int argc, char ** argv) {
const std::string token_str = llama_token_to_piece(ctx, id, params.special); const std::string token_str = llama_token_to_piece(ctx, id, params.special);
// Console/Stream Output // Console/Stream Output
fprintf(stdout, "%s", token_str.c_str()); LOG("%s", token_str.c_str());
// Record Displayed Tokens To Log // Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check // Note: Generated tokens are created one by one hence this check
@@ -782,8 +729,6 @@ int main(int argc, char ** argv) {
output_tokens.push_back(id); output_tokens.push_back(id);
output_ss << token_str; output_ss << token_str;
} }
fflush(stdout);
} }
} }
@@ -798,7 +743,7 @@ int main(int argc, char ** argv) {
// check for reverse prompt in the last n_prev tokens // check for reverse prompt in the last n_prev tokens
if (!params.antiprompt.empty()) { if (!params.antiprompt.empty()) {
const int n_prev = 32; const int n_prev = 32;
const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev);
is_antiprompt = false; is_antiprompt = false;
// Check if each of the reverse prompts appears at the end of the output. // Check if each of the reverse prompts appears at the end of the output.
@@ -820,7 +765,7 @@ int main(int argc, char ** argv) {
} }
// check for reverse prompt using special tokens // check for reverse prompt using special tokens
llama_token last_token = llama_sampling_last(ctx_sampling); llama_token last_token = gpt_sampler_last(smpl);
for (std::vector<llama_token> ids : antiprompt_ids) { for (std::vector<llama_token> ids : antiprompt_ids) {
if (ids.size() == 1 && last_token == ids[0]) { if (ids.size() == 1 && last_token == ids[0]) {
if (params.interactive) { if (params.interactive) {
@@ -832,13 +777,13 @@ int main(int argc, char ** argv) {
} }
if (is_antiprompt) { if (is_antiprompt) {
LOG("found antiprompt: %s\n", last_output.c_str()); LOG_DBG("found antiprompt: %s\n", last_output.c_str());
} }
} }
// deal with end of generation tokens in interactive mode // deal with end of generation tokens in interactive mode
if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) { if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
LOG("found an EOG token\n"); LOG_DBG("found an EOG token\n");
if (params.interactive) { if (params.interactive) {
if (!params.antiprompt.empty()) { if (!params.antiprompt.empty()) {
@@ -852,32 +797,32 @@ int main(int argc, char ** argv) {
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
} }
is_interacting = true; is_interacting = true;
printf("\n"); LOG("\n");
} }
} }
// if current token is not EOG, we add it to current assistant message // if current token is not EOG, we add it to current assistant message
if (params.conversation) { if (params.conversation) {
auto id = llama_sampling_last(ctx_sampling); const auto id = gpt_sampler_last(smpl);
assistant_ss << llama_token_to_piece(ctx, id, false); assistant_ss << llama_token_to_piece(ctx, id, false);
} }
if (n_past > 0 && is_interacting) { if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n"); LOG_DBG("waiting for user input\n");
if (params.conversation) { if (params.conversation) {
printf("\n> "); LOG("\n> ");
} }
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n"); LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model)); embd_inp.push_back(llama_token_bos(model));
} }
std::string buffer; std::string buffer;
if (!params.input_prefix.empty() && !params.conversation) { if (!params.input_prefix.empty() && !params.conversation) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
printf("%s", params.input_prefix.c_str()); LOG("%s", params.input_prefix.c_str());
} }
// color user input only // color user input only
@@ -900,11 +845,11 @@ int main(int argc, char ** argv) {
if (buffer.length() > 1) { if (buffer.length() > 1) {
// append input suffix if any // append input suffix if any
if (!params.input_suffix.empty() && !params.conversation) { if (!params.input_suffix.empty() && !params.conversation) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
printf("%s", params.input_suffix.c_str()); LOG("%s", params.input_suffix.c_str());
} }
LOG("buffer: '%s'\n", buffer.c_str()); LOG_DBG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size(); const size_t original_size = embd_inp.size();
@@ -921,7 +866,7 @@ int main(int argc, char ** argv) {
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
// if user stop generation mid-way, we must add EOT to finish model's last response // if user stop generation mid-way, we must add EOT to finish model's last response
if (need_insert_eot && format_chat) { if (need_insert_eot && format_chat) {
@@ -944,9 +889,9 @@ int main(int argc, char ** argv) {
assistant_ss.str(""); assistant_ss.str("");
n_remain -= line_inp.size(); n_remain -= line_inp.size();
LOG("n_remain: %d\n", n_remain); LOG_DBG("n_remain: %d\n", n_remain);
} else { } else {
LOG("empty line, passing control back\n"); LOG_DBG("empty line, passing control back\n");
} }
input_echo = false; // do not echo this again input_echo = false; // do not echo this again
@@ -954,7 +899,7 @@ int main(int argc, char ** argv) {
if (n_past > 0) { if (n_past > 0) {
if (is_interacting) { if (is_interacting) {
llama_sampling_reset(ctx_sampling); gpt_sampler_reset(smpl);
} }
is_interacting = false; is_interacting = false;
} }
@@ -962,7 +907,7 @@ int main(int argc, char ** argv) {
// end of generation // end of generation
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
LOG_TEE(" [end of text]\n"); LOG(" [end of text]\n");
break; break;
} }
@@ -975,23 +920,23 @@ int main(int argc, char ** argv) {
} }
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
} }
llama_print_timings(ctx); LOG("\n\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
if (ctx_guidance) { llama_free(ctx_guidance); } gpt_sampler_free(smpl);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
llama_sampling_free(ctx_sampling);
llama_backend_free(); llama_backend_free();
#ifndef LOG_DISABLE_LOGS ggml_threadpool_free(threadpool);
LOG_TEE("Log end\n"); ggml_threadpool_free(threadpool_batch);
#endif // LOG_DISABLE_LOGS
return 0; return 0;
} }


@@ -1,7 +1,10 @@
// A basic application simulating a server with multiple clients. // A basic application simulating a server with multiple clients.
// The clients submit requests to the server and they are processed in parallel. // The clients submit requests to the server and they are processed in parallel.
#include "arg.h"
#include "common.h" #include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <cmath> #include <cmath>
@@ -50,8 +53,8 @@ static std::vector<std::string> k_prompts = {
struct client { struct client {
~client() { ~client() {
if (ctx_sampling) { if (smpl) {
llama_sampling_free(ctx_sampling); gpt_sampler_free(smpl);
} }
} }
@@ -72,7 +75,7 @@ struct client {
std::string prompt; std::string prompt;
std::string response; std::string response;
struct llama_sampling_context * ctx_sampling = nullptr; struct gpt_sampler * smpl = nullptr;
}; };
static void print_date_time() { static void print_date_time() {
@@ -81,7 +84,9 @@ static void print_date_time() {
char buffer[80]; char buffer[80];
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time); strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer); LOG_INF("\n");
LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer);
LOG_INF("\n");
} }
// Define a split string function to ... // Define a split string function to ...
@@ -100,11 +105,12 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
gpt_init();
// number of simultaneous "clients" to simulate // number of simultaneous "clients" to simulate
const int32_t n_clients = params.n_parallel; const int32_t n_clients = params.n_parallel;
@@ -119,12 +125,6 @@ int main(int argc, char ** argv) {
const bool dump_kv_cache = params.dump_kv_cache; const bool dump_kv_cache = params.dump_kv_cache;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("parallel", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp // init llama.cpp
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@@ -137,23 +137,22 @@ int main(int argc, char ** argv) {
// load the prompts from an external file if there are any // load the prompts from an external file if there are any
if (params.prompt.empty()) { if (params.prompt.empty()) {
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
} else { } else {
// Output each line of the input params.prompts vector and copy to k_prompts // Output each line of the input params.prompts vector and copy to k_prompts
int index = 0; int index = 0;
printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str()); LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
std::vector<std::string> prompts = split_string(params.prompt, '\n'); std::vector<std::string> prompts = split_string(params.prompt, '\n');
for (const auto& prompt : prompts) { for (const auto& prompt : prompts) {
k_prompts.resize(index + 1); k_prompts.resize(index + 1);
k_prompts[index] = prompt; k_prompts[index] = prompt;
index++; index++;
printf("%3d prompt: %s\n", index, prompt.c_str()); LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
} }
} }
fprintf(stderr, "\n\n"); LOG_INF("\n\n");
fflush(stderr);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
@@ -161,7 +160,7 @@ int main(int argc, char ** argv) {
for (size_t i = 0; i < clients.size(); ++i) { for (size_t i = 0; i < clients.size(); ++i) {
auto & client = clients[i]; auto & client = clients[i];
client.id = i; client.id = i;
client.ctx_sampling = llama_sampling_init(params.sparams); client.smpl = gpt_sampler_init(model, params.sparams);
} }
std::vector<llama_token> tokens_system; std::vector<llama_token> tokens_system;
@@ -182,19 +181,19 @@ int main(int argc, char ** argv) {
const auto t_main_start = ggml_time_us(); const auto t_main_start = ggml_time_us();
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__); LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
LOG_TEE("\n"); LOG_INF("\n");
{ {
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__); LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
for (int32_t i = 0; i < n_tokens_system; ++i) { for (int32_t i = 0; i < n_tokens_system; ++i) {
llama_batch_add(batch, tokens_system[i], i, { 0 }, false); llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
} }
if (llama_decode(ctx, batch) != 0) { if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
@@ -203,10 +202,10 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
} }
LOG_TEE("\n"); LOG_INF("\n");
} }
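To summarize the hunk above after the logging rename: the shared system prompt is decoded once into sequence 0, and its KV cache is then copied to every client's sequence. A sketch; the client-loop bounds are an assumption, since the diff only shows the copy call itself:

    for (int32_t i = 0; i < n_tokens_system; ++i) {
        llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
    }

    if (llama_decode(ctx, batch) != 0) {
        LOG_ERR("%s: llama_decode() failed\n", __func__);
        return 1;
    }

    // assumed bounds: one target sequence per simulated client
    for (int32_t i = 1; i <= n_clients; ++i) {
        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
    }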
LOG_TEE("Processing requests ...\n\n"); LOG_INF("Processing requests ...\n\n");
while (true) { while (true) {
if (dump_kv_cache) { if (dump_kv_cache) {
@@ -237,7 +236,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
} }
LOG_TEE("%s: clearing the KV cache\n", __func__); LOG_INF("%s: clearing the KV cache\n", __func__);
} }
// insert new sequences for decoding // insert new sequences for decoding
@@ -253,7 +252,7 @@ int main(int argc, char ** argv) {
client.prompt = client.input + "\nAssistant:"; client.prompt = client.input + "\nAssistant:";
client.response = ""; client.response = "";
llama_sampling_reset(client.ctx_sampling); gpt_sampler_reset(client.smpl);
// do not prepend BOS because we have a system prompt! // do not prepend BOS because we have a system prompt!
std::vector<llama_token> tokens_prompt; std::vector<llama_token> tokens_prompt;
@@ -272,7 +271,7 @@ int main(int argc, char ** argv) {
client.n_decoded = 0; client.n_decoded = 0;
client.i_batch = batch.n_tokens - 1; client.i_batch = batch.n_tokens - 1;
LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
g_seq_id += 1; g_seq_id += 1;
@@ -316,11 +315,11 @@ int main(int argc, char ** argv) {
if (ret != 0) { if (ret != 0) {
if (n_batch == 1 || ret < 0) { if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size // if you get here, it means the KV cache is full - try increasing it via the context size
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
return 1; return 1;
} }
LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
n_cache_miss += 1; n_cache_miss += 1;
@@ -331,7 +330,7 @@ int main(int argc, char ** argv) {
continue; continue;
} }
LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens); LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
for (auto & client : clients) { for (auto & client : clients) {
if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) { if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
@@ -341,9 +340,9 @@ int main(int argc, char ** argv) {
//printf("client %d, seq %d, token %d, pos %d, batch %d\n", //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i); const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i);
llama_sampling_accept(client.ctx_sampling, ctx, id, true); gpt_sampler_accept(client.smpl, id, true);
if (client.n_decoded == 1) { if (client.n_decoded == 1) {
// start measuring generation time after the first token to make sure all concurrent clients // start measuring generation time after the first token to make sure all concurrent clients
@@ -371,12 +370,12 @@ int main(int argc, char ** argv) {
} }
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1); llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1); llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
const auto t_main_end = ggml_time_us(); const auto t_main_end = ggml_time_us();
LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n", LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded, client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
(t_main_end - client.t_start_prompt) / 1e6, (t_main_end - client.t_start_prompt) / 1e6,
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6, (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
@@ -399,21 +398,22 @@ int main(int argc, char ** argv) {
print_date_time(); print_date_time();
LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
if (params.prompt_file.empty()) { if (params.prompt_file.empty()) {
params.prompt_file = "used built-in defaults"; params.prompt_file = "used built-in defaults";
} }
LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str()); LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str()); LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); LOG_INF("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Cache misses: %6d\n", n_cache_miss); LOG_INF("Cache misses: %6d\n", n_cache_miss);
LOG_TEE("\n"); LOG_INF("\n");
llama_print_timings(ctx); // TODO: print sampling/grammar timings for all clients
llama_perf_context_print(ctx);
llama_batch_free(batch); llama_batch_free(batch);
@@ -422,7 +422,7 @@ int main(int argc, char ** argv) {
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n"); LOG("\n\n");
return 0; return 0;
} }


@@ -1,4 +1,6 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <cmath> #include <cmath>
@@ -6,12 +8,10 @@
#include <string> #include <string>
#include <vector> #include <vector>
static void print_usage(int argc, char ** argv, const gpt_params & params) { static void print_usage(int, char ** argv) {
gpt_params_print_usage(argc, argv, params); LOG("\nexample usage:\n");
LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
LOG_TEE("\nexample usage:\n"); LOG("\n");
LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
LOG_TEE("\n");
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
@@ -21,12 +21,11 @@ int main(int argc, char ** argv) {
params.n_keep = 32; params.n_keep = 32;
params.i_pos = -1; params.i_pos = -1;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
print_usage(argc, argv, params);
return 1; return 1;
} }
srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed); gpt_init();
int n_junk = params.n_junk; int n_junk = params.n_junk;
int n_keep = params.n_keep; int n_keep = params.n_keep;
@@ -67,7 +66,7 @@ int main(int argc, char ** argv) {
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__); LOG_ERR("%s: unable to load model\n" , __func__);
return 1; return 1;
} }
@@ -80,12 +79,17 @@ int main(int argc, char ** argv) {
GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp"); GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) { if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return 1; return 1;
} }
auto sparams = llama_sampler_chain_default_params();
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> tokens_list; std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true); tokens_list = ::llama_tokenize(ctx, params.prompt, true);
@@ -106,14 +110,14 @@ int main(int argc, char ** argv) {
const int n_batch = ctx_params.n_batch; const int n_batch = ctx_params.n_batch;
const int n_batch_grp = ctx_params.n_batch/n_grp; const int n_batch_grp = ctx_params.n_batch/n_grp;
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
// print the prompt token-by-token // print the prompt token-by-token
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("prefix tokens: %d\n", n_tokens_prefix); LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
LOG_TEE("prompt tokens: %d\n", n_tokens_all); LOG_INF("prompt tokens: %d\n", n_tokens_all);
//LOG_TEE("prompt: %s\n", params.prompt.c_str()); //LOG_INF("prompt: %s\n", params.prompt.c_str());
llama_batch batch = llama_batch_init(params.n_batch, 0, 1); llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
@@ -144,11 +148,11 @@ int main(int argc, char ** argv) {
} }
if (llama_decode(ctx, batch) != 0) { if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_INF("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
if (i + n_batch >= n_tokens_all) { if (i + n_batch >= n_tokens_all) {
break; break;
@@ -158,7 +162,7 @@ int main(int argc, char ** argv) {
for (int i = n_ctx; i < n_tokens_all; i += n_batch) { for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
const int n_discard = n_batch; const int n_discard = n_batch;
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard); LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
@@ -178,18 +182,18 @@ int main(int argc, char ** argv) {
} }
if (llama_decode(ctx, batch) != 0) { if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
} }
{ {
const int n_discard = n_past - n_ctx + n_predict; const int n_discard = n_past - n_ctx + n_predict;
if (n_discard > 0) { if (n_discard > 0) {
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
@@ -200,47 +204,32 @@ int main(int argc, char ** argv) {
} }
} }
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk); LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
LOG_TEE("\n"); LOG_INF("\n");
// main loop // main loop
int n_cur = n_tokens_all; int n_cur = n_tokens_all;
int n_decode = 0; int n_decode = 0;
LOG_TEE("%s", prompt_suffix.c_str()); LOG_INF("%s", prompt_suffix.c_str());
fflush(stdout);
const auto t_main_start = ggml_time_us(); const auto t_main_start = ggml_time_us();
while (n_cur <= n_len) { while (n_cur <= n_len) {
// sample the next token // sample the next token
{ {
auto n_vocab = llama_n_vocab(model); const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// sample the most likely token
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of generation? // is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
LOG_TEE("\n"); LOG("\n");
break; break;
} }
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);
n_decode += 1; n_decode += 1;
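Together with the sampler chain created earlier in this file, the hunk above replaces the hand-rolled candidates array with a single call. The essential pattern, using only names that appear in this diff:

    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());    // greedy = pick the highest-logit token

    // after llama_decode(), sample from the last position in the batch
    const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);

    // ... detokenize, append to the batch, loop ...
    llama_sampler_free(smpl);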
@@ -255,21 +244,24 @@ int main(int argc, char ** argv) {
// evaluate the current batch with the transformer model // evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) { if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1; return 1;
} }
} }
LOG_TEE("\n"); LOG("\n");
const auto t_main_end = ggml_time_us(); const auto t_main_end = ggml_time_us();
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
llama_print_timings(ctx); LOG("\n");
llama_perf_context_print(ctx);
fprintf(stderr, "\n"); LOG("\n");
llama_sampler_free(smpl);
llama_batch_free(batch); llama_batch_free(batch);


@@ -1,18 +1,21 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <algorithm>
#include <array>
#include <atomic>
#include <cmath> #include <cmath>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <ctime> #include <ctime>
#include <fstream>
#include <mutex>
#include <random>
#include <sstream> #include <sstream>
#include <thread> #include <thread>
#include <mutex>
#include <atomic>
#include <vector> #include <vector>
#include <array>
#include <fstream>
#include <sstream>
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
@@ -40,7 +43,7 @@ static void write_logfile(
} }
if (params.hellaswag) { if (params.hellaswag) {
fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__); LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
return; return;
} }
@@ -48,7 +51,7 @@ static void write_logfile(
const bool success = fs_create_directory_with_parents(params.logdir); const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) { if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str()); __func__, params.logdir.c_str());
return; return;
} }
@@ -57,7 +60,7 @@ static void write_logfile(
FILE * logfile = fopen(logfile_path.c_str(), "w"); FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) { if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return; return;
} }
@@ -76,7 +79,7 @@ static void write_logfile(
fprintf(logfile, "ppl_value: %f\n", results.ppl_value); fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
yaml_dump_vector_float(logfile, "probs", results.probs); yaml_dump_vector_float(logfile, "probs", results.probs);
llama_dump_timing_info_yaml(logfile, ctx); llama_perf_dump_yaml(logfile, ctx);
fclose(logfile); fclose(logfile);
} }
@@ -340,19 +343,19 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
// Output: `perplexity: 13.5106 [114/114]` // Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval // BOS tokens will be added for each chunk before eval
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
fprintf(stderr, "%s: tokenizing the input ..\n", __func__); LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true); std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
if (int(tokens.size()) < 2*n_ctx) { if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx); n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}}; return {std::move(tokens), 0., {}, {}};
} }
@@ -363,16 +366,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
prob_history.resize(tokens.size()); prob_history.resize(tokens.size());
if (params.ppl_stride <= 0) { if (params.ppl_stride <= 0) {
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
return {tokens, -1, logit_history, prob_history}; return {tokens, -1, logit_history, prob_history};
} }
const int calc_chunk = n_ctx; const int calc_chunk = n_ctx;
fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
if (int(tokens.size()) <= calc_chunk) { if (int(tokens.size()) <= calc_chunk) {
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
tokens.size(), n_ctx, params.ppl_stride); tokens.size(), n_ctx, params.ppl_stride);
return {tokens, -1, logit_history, prob_history}; return {tokens, -1, logit_history, prob_history};
} }
@ -386,14 +389,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
int count = 0; int count = 0;
double nll = 0.0; double nll = 0.0;
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
for (int i = 0; i < n_chunk; ++i) { for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.ppl_stride; const int start = i * params.ppl_stride;
const int end = start + calc_chunk; const int end = start + calc_chunk;
const int num_batches = (calc_chunk + n_batch - 1) / n_batch; const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
//fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches); //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
std::vector<float> logits; std::vector<float> logits;
@ -406,10 +409,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
const int batch_start = start + j * n_batch; const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch); const int batch_size = std::min(end - batch_start, n_batch);
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); //LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
// TODO: use llama_batch.logits instead of relying on logits_all == true // TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
//fprintf(stderr, "%s : failed to eval\n", __func__); //LOG_ERR("%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history}; return {tokens, -1, logit_history, prob_history};
} }
@ -433,16 +436,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
if (i == 0) { if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count(); const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk); int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) { if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60)); LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60); total_seconds = total_seconds % (60*60);
} }
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); LOG("%.2f minutes\n", total_seconds / 60.0);
} }
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) { for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones. // Calculate probability of next token, given the previous ones.
@ -459,13 +462,12 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
} }
// perplexity is e^(average negative log-likelihood) // perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) { if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else { } else {
printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
} }
fflush(stdout);
} }
printf("\n"); LOG("\n");
return {tokens, std::exp(nll / count), logit_history, prob_history}; return {tokens, std::exp(nll / count), logit_history, prob_history};
} }
@ -480,33 +482,33 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// Output: `perplexity: 13.5106 [114/114]` // Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval // BOS tokens will be added for each chunk before eval
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
std::ofstream logits_stream; std::ofstream logits_stream;
if (!params.logits_file.empty()) { if (!params.logits_file.empty()) {
logits_stream.open(params.logits_file.c_str(), std::ios::binary); logits_stream.open(params.logits_file.c_str(), std::ios::binary);
if (!logits_stream.is_open()) { if (!logits_stream.is_open()) {
fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str()); LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
return {}; return {};
} }
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
logits_stream.write("_logits_", 8); logits_stream.write("_logits_", 8);
logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx)); logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
} }
auto tim1 = std::chrono::high_resolution_clock::now(); auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__); LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true); std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now(); auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count()); LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (int(tokens.size()) < 2*n_ctx) { if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx); n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}}; return {std::move(tokens), 0., {}, {}};
} }
@ -539,7 +541,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
logits.reserve((size_t)n_ctx * n_vocab); logits.reserve((size_t)n_ctx * n_vocab);
} }
fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1); std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@ -612,7 +614,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
} }
if (llama_decode(ctx, batch)) { if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__); LOG_INF("%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history}; return {tokens, -1, logit_history, prob_history};
} }
@ -627,13 +629,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
llama_synchronize(ctx); llama_synchronize(ctx);
const auto t_end = std::chrono::high_resolution_clock::now(); const auto t_end = std::chrono::high_resolution_clock::now();
const float t_total = std::chrono::duration<float>(t_end - t_start).count(); const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total*n_chunk/n_seq); int total_seconds = (int)(t_total*n_chunk/n_seq);
if (total_seconds >= 60*60) { if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60)); LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60); total_seconds = total_seconds % (60*60);
} }
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); LOG("%.2f minutes\n", total_seconds / 60.0);
} }
for (int seq = 0; seq < n_seq_batch; seq++) { for (int seq = 0; seq < n_seq_batch; seq++) {
@ -655,19 +657,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// perplexity is e^(average negative log-likelihood) // perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) { if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
} else { } else {
double av = nll/count; double av = nll/count;
double av2 = nll2/count - av*av; double av2 = nll2/count - av*av;
if (av2 > 0) av2 = sqrt(av2/(count-1)); if (av2 > 0) av2 = sqrt(av2/(count-1));
printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
} }
} }
fflush(stdout);
logits.clear(); logits.clear();
} }
printf("\n"); LOG("\n");
nll2 /= count; nll2 /= count;
nll /= count; nll /= count;
@ -675,9 +676,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
nll2 -= nll * nll; nll2 -= nll * nll;
if (nll2 > 0) { if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1)); nll2 = sqrt(nll2/(count-1));
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else { } else {
printf("Unexpected negative standard deviation of log(prob)\n"); LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
} }
llama_batch_free(batch); llama_batch_free(batch);
@ -703,7 +704,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
const int ret = llama_decode(ctx, batch_view); const int ret = llama_decode(ctx, batch_view);
if (ret != 0) { if (ret != 0) {
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false; return false;
} }
@ -789,15 +790,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
} }
if (prompt_lines.size() % 6 != 0) { if (prompt_lines.size() % 6 != 0) {
fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__); LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
return; return;
} }
size_t hs_task_count = prompt_lines.size()/6; size_t hs_task_count = prompt_lines.size()/6;
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM; const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
fprintf(stderr, "================================= is_spm = %d\n", is_spm); LOG_INF("================================= is_spm = %d\n", is_spm);
// The tasks should be randomized so the score stabilizes quickly. // The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true; bool randomize_tasks = true;
@ -824,7 +825,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
std::vector<llama_token> seq_tokens[4]; std::vector<llama_token> seq_tokens[4];
}; };
fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") ); LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
// Select and read data from prompt lines // Select and read data from prompt lines
std::vector<hs_data_t> hs_data(hs_task_count); std::vector<hs_data_t> hs_data(hs_task_count);
@ -870,9 +871,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
} }
} }
fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__); LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
printf("\ntask\tacc_norm\n"); LOG("\ntask\tacc_norm\n");
double acc = 0.0f; double acc = 0.0f;
@ -940,7 +941,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
} }
if (i0 == i1) { if (i0 == i1) {
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return; return;
} }
@ -948,7 +949,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// decode all tasks [i0, i1) // decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
fprintf(stderr, "%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return; return;
} }
@ -998,7 +999,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
} }
} }
//printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx); //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
// If the gold ending got the maximum logprob, add one accuracy point // If the gold ending got the maximum logprob, add one accuracy point
if (ending_logprob_max_idx == hs_cur.gold_ending_idx) { if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
@ -1006,8 +1007,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
} }
// Print the accumulated accuracy mean x 100 // Print the accumulated accuracy mean x 100
printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0); LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
fflush(stdout);
} }
i0 = i1 - 1; i0 = i1 - 1;
@ -1015,7 +1015,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
llama_batch_free(batch); llama_batch_free(batch);
printf("\n"); LOG("\n");
} }
struct winogrande_entry { struct winogrande_entry {
@ -1059,7 +1059,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
} }
} }
if (ipos != 4) { if (ipos != 4) {
printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str()); LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
continue; continue;
} }
auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3) auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
@ -1073,13 +1073,13 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
if (sentence[where] == '_') break; if (sentence[where] == '_') break;
} }
if (where == int(sentence.size())) { if (where == int(sentence.size())) {
printf("%s: no _ in <%s>\n", __func__, sentence.c_str()); LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
continue; continue;
} }
std::istringstream stream(answer.c_str()); std::istringstream stream(answer.c_str());
int i_answer; stream >> i_answer; int i_answer; stream >> i_answer;
if (stream.fail() || i_answer < 1 || i_answer > 2) { if (stream.fail() || i_answer < 1 || i_answer > 2) {
printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str()); LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
continue; continue;
} }
result.emplace_back(); result.emplace_back();
@ -1108,14 +1108,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
auto data = load_winogrande_from_csv(params.prompt); auto data = load_winogrande_from_csv(params.prompt);
if (data.empty()) { if (data.empty()) {
fprintf(stderr, "%s: no tasks\n", __func__); LOG_ERR("%s: no tasks\n", __func__);
return; return;
} }
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size()); LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) { if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks); LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
std::mt19937 rng(1); std::mt19937 rng(1);
std::vector<int> aux(data.size()); std::vector<int> aux(data.size());
for (int i = 0; i < int(data.size()); ++i) { for (int i = 0; i < int(data.size()); ++i) {
@ -1133,7 +1133,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
data = std::move(selected); data = std::move(selected);
} }
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__); LOG_INF("%s : tokenizing selected tasks\n", __func__);
for (auto & task : data) { for (auto & task : data) {
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true); task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
@ -1156,7 +1156,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size(); task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
} }
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__); LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
@ -1217,7 +1217,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
} }
if (i0 == i1) { if (i0 == i1) {
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return; return;
} }
@ -1225,7 +1225,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
// decode all tasks [i0, i1) // decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
fprintf(stderr, "%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return; return;
} }
@ -1285,20 +1285,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
++n_done; ++n_done;
// print the accumulated accuracy mean x 100 // print the accumulated accuracy mean x 100
printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer); LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
fflush(stdout);
} }
i0 = i1 - 1; i0 = i1 - 1;
} }
printf("\n"); LOG("\n");
if (n_done < 100) return; if (n_done < 100) return;
const float p = 1.f*n_correct/n_done; const float p = 1.f*n_correct/n_done;
const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1)); const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
} }
static bool deserialize_string(std::istream & in, std::string & str) { static bool deserialize_string(std::istream & in, std::string & str) {
@ -1347,7 +1347,7 @@ struct multiple_choice_task {
static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) { static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
if (task.question.empty() || task.mc1.answers.empty()) { if (task.question.empty() || task.mc1.answers.empty()) {
if (log_error) { if (log_error) {
printf("%s: found bad task with empty question and/or answers\n", __func__); LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
} }
return false; return false;
} }
@ -1355,7 +1355,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
for (auto& answer : task.mc1.answers) { for (auto& answer : task.mc1.answers) {
if (answer.empty()) { if (answer.empty()) {
if (log_error) { if (log_error) {
printf("%s: found empty answer\n", __func__); LOG_ERR("%s: found empty answer\n", __func__);
} }
return false; return false;
} }
@ -1409,14 +1409,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
uint32_t n_task; uint32_t n_task;
strstream.read((char *)&n_task, sizeof(n_task)); strstream.read((char *)&n_task, sizeof(n_task));
if (strstream.fail() || n_task == 0) { if (strstream.fail() || n_task == 0) {
printf("%s: no tasks\n", __func__); LOG_ERR("%s: no tasks\n", __func__);
return; return;
} }
printf("%s: there are %u tasks in prompt\n", __func__, n_task); LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
std::vector<uint32_t> task_pos(n_task); std::vector<uint32_t> task_pos(n_task);
strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t)); strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
if (strstream.fail()) { if (strstream.fail()) {
printf("%s: failed to read task positions from prompt\n", __func__); LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
return; return;
} }
@ -1424,21 +1424,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) { if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
// Use all tasks // Use all tasks
tasks.resize(n_task); tasks.resize(n_task);
printf("%s: reading tasks", __func__); LOG_INF("%s: reading tasks", __func__);
int n_dot = std::max((int) n_task/100, 1); int n_dot = std::max((int) n_task/100, 1);
int i = 0; int i = 0;
for (auto& task : tasks) { for (auto& task : tasks) {
++i; ++i;
if (!task.deserialize(strstream)) { if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d of %u\n", __func__, i, n_task); LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
return; return;
} }
if (i%n_dot == 0) printf("."); if (i%n_dot == 0) LOG(".");
} }
printf("done\n"); LOG("done\n");
} }
else { else {
printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task); LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
std::mt19937 rng(1); std::mt19937 rng(1);
std::vector<int> aux(n_task); std::vector<int> aux(n_task);
for (uint32_t i = 0; i < n_task; ++i) aux[i] = i; for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
@ -1451,18 +1451,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
aux.pop_back(); aux.pop_back();
strstream.seekg(task_pos[idx], std::ios::beg); strstream.seekg(task_pos[idx], std::ios::beg);
if (!task.deserialize(strstream)) { if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]); LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
return; return;
} }
} }
n_task = params.multiple_choice_tasks; n_task = params.multiple_choice_tasks;
} }
printf("%s: preparing task data", __func__); LOG_INF("%s: preparing task data", __func__);
fflush(stdout);
if (n_task > 500) { if (n_task > 500) {
printf("..."); LOG("...");
fflush(stdout);
std::atomic<int> counter(0); std::atomic<int> counter(0);
std::atomic<int> n_bad(0); std::atomic<int> n_bad(0);
auto prepare = [&counter, &n_bad, &tasks, ctx] () { auto prepare = [&counter, &n_bad, &tasks, ctx] () {
@ -1486,11 +1484,10 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
for (auto& w : workers) w = std::thread(prepare); for (auto& w : workers) w = std::thread(prepare);
prepare(); prepare();
for (auto& w : workers) w.join(); for (auto& w : workers) w.join();
printf("done\n"); LOG("done\n");
fflush(stdout);
int nbad = n_bad; int nbad = n_bad;
if (nbad > 0) { if (nbad > 0) {
printf("%s: found %d malformed tasks\n", __func__, nbad); LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
return; return;
} }
} else { } else {
@ -1502,16 +1499,15 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
return; return;
} }
if (i_task%n_dot == 0) { if (i_task%n_dot == 0) {
printf("."); LOG(".");
fflush(stdout);
} }
} }
printf("done\n"); LOG("done\n");
} }
printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size()); LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
printf("\ntask\tacc_norm\n"); LOG("\ntask\tacc_norm\n");
const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
@ -1590,7 +1586,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
} }
if (i0 == i1) { if (i0 == i1) {
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return; return;
} }
@ -1598,7 +1594,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// decode all tasks [i0, i1) // decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
fprintf(stderr, "%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return; return;
} }
@ -1622,13 +1618,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// compute the logprobs for each ending of the decoded tasks // compute the logprobs for each ending of the decoded tasks
for (size_t i = i0; i < i1; ++i) { for (size_t i = i0; i < i1; ++i) {
auto & cur_task = tasks[i]; auto & cur_task = tasks[i];
//printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str()); //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
//for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) { //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
// if (cur_task.mc1.labels[j] == 1) { // if (cur_task.mc1.labels[j] == 1) {
// printf("%d", j+1); // LOG("%d", j+1);
// } // }
//} //}
//printf("\n common_prefix: %zu\n", cur_task.common_prefix); //LOG("\n common_prefix: %zu\n", cur_task.common_prefix);
// get the logits of the last token of the common prefix // get the logits of the last token of the common prefix
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float)); std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
@ -1640,13 +1636,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
size_t count = 1; size_t count = 1;
float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]); float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
//printf(" %zu %g\n", ir, eval_results[ir]); //LOG(" %zu %g\n", ir, eval_results[ir]);
++count; ++count;
log_prob += eval_results[ir++]; log_prob += eval_results[ir++];
} }
cur_task.log_probs[s] = log_prob / count; cur_task.log_probs[s] = log_prob / count;
//printf(" Final: %g\n", log_prob / count); //LOG(" Final: %g\n", log_prob / count);
//printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count); //LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
} }
// Find the ending with maximum logprob // Find the ending with maximum logprob
@ -1666,8 +1662,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
++n_done; ++n_done;
// Print the accumulated accuracy mean x 100 // Print the accumulated accuracy mean x 100
printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done); LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
fflush(stdout);
} }
i0 = i1 - 1; i0 = i1 - 1;
@ -1679,29 +1674,30 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
float p = 1.f*n_correct/n_done; float p = 1.f*n_correct/n_done;
float sigma = sqrt(p*(1-p)/(n_done-1)); float sigma = sqrt(p*(1-p)/(n_done-1));
printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); LOG("\n");
LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
p = 1.f*n_done/n_tot_answers; p = 1.f*n_done/n_tot_answers;
sigma = sqrt(p*(1-p)/(n_done-1)); sigma = sqrt(p*(1-p)/(n_done-1));
printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
printf("\n"); LOG_INF("\n");
} }
static void kl_divergence(llama_context * ctx, const gpt_params & params) { static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (params.logits_file.empty()) { if (params.logits_file.empty()) {
fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
return; return;
} }
std::ifstream in(params.logits_file.c_str(), std::ios::binary); std::ifstream in(params.logits_file.c_str(), std::ios::binary);
if (!in) { if (!in) {
fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str()); LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
return; return;
} }
{ {
char check[9]; check[8] = 0; char check[9]; check[8] = 0;
in.read(check, 8); in.read(check, 8);
if (in.fail() || strncmp("_logits_", check, 8) != 0) { if (in.fail() || strncmp("_logits_", check, 8) != 0) {
fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str()); LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
return; return;
} }
} }
@ -1709,7 +1705,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
uint32_t n_ctx; uint32_t n_ctx;
in.read((char *)&n_ctx, sizeof(n_ctx)); in.read((char *)&n_ctx, sizeof(n_ctx));
if (n_ctx > llama_n_ctx(ctx)) { if (n_ctx > llama_n_ctx(ctx)) {
fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n", LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx); __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
} }
@ -1717,24 +1713,24 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
in.read((char *)&n_vocab, sizeof(n_vocab)); in.read((char *)&n_vocab, sizeof(n_vocab));
in.read((char *)&n_chunk, sizeof(n_chunk)); in.read((char *)&n_chunk, sizeof(n_chunk));
if (in.fail()) { if (in.fail()) {
fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
return; return;
} }
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) { if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
} }
std::vector<llama_token> tokens(n_ctx * n_chunk); std::vector<llama_token> tokens(n_ctx * n_chunk);
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) { if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
return; return;
} }
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
const int num_batches = (n_ctx + n_batch - 1)/n_batch; const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int nv = 2*((n_vocab + 1)/2) + 4; const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv); std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@ -1775,7 +1771,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
const auto t_start = std::chrono::high_resolution_clock::now(); const auto t_start = std::chrono::high_resolution_clock::now();
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) { if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i); LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
return; return;
} }
@ -1796,7 +1792,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
// TODO: use llama_batch.logits instead of relying on logits_all == true // TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return; return;
} }
@ -1813,16 +1809,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (i == 0) { if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count(); const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk); int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) { if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60)); LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60); total_seconds = total_seconds % (60*60);
} }
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); LOG("%.2f minutes\n", total_seconds / 60.0);
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
} }
LOG("\n");
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
const int first = n_ctx/2; const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
@ -1831,79 +1827,77 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
p_diff_ptr += n_ctx - 1 - first; p_diff_ptr += n_ctx - 1 - first;
kld_ptr += n_ctx - 1 - first; kld_ptr += n_ctx - 1 - first;
printf("%4d", i+1); LOG("%4d", i+1);
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first); const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc); LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc); LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second); LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
const double p_diff_rms_val = sqrt(p_diff_mse.first); const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
double p_top_val = 1.*kld.n_same_top/kld.count; double p_top_val = 1.*kld.n_same_top/kld.count;
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1)); double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc); LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
printf("\n"); LOG("\n");
fflush(stdout);
logits.clear(); logits.clear();
} }
printf("\n"); LOG("\n");
if (kld.count < 100) return; // we do not wish to do statistics on so few values if (kld.count < 100) return; // we do not wish to do statistics on so few values
std::sort(kld_values.begin(), kld_values.end()); std::sort(kld_values.begin(), kld_values.end());
std::sort(p_diff_values.begin(), p_diff_values.end()); std::sort(p_diff_values.begin(), p_diff_values.end());
printf("====== Perplexity statistics ======\n"); LOG("====== Perplexity statistics ======\n");
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first); const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc); LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double ppl_base_val = exp(log_ppl_base.first); const double ppl_base_val = exp(log_ppl_base.first);
const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 ) const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc); LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
// printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov); // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second); const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor); LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc); LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
const double ppl_ratio_val = exp(log_ppl_ratio_val); const double ppl_ratio_val = exp(log_ppl_ratio_val);
const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 ) const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc); LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov; const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
const double ppl_diff_val = ppl_val - ppl_base_val; const double ppl_diff_val = ppl_val - ppl_base_val;
const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov); const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc); LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
printf("\n"); LOG("\n");
printf("====== KL divergence statistics ======\n"); LOG("====== KL divergence statistics ======\n");
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second); LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1]) auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
: kld_values[kld_values.size()/2]; : kld_values[kld_values.size()/2];
@ -1915,50 +1909,49 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)]; return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
}; };
printf("Maximum KLD: %10.6f\n", kld_values.back()); LOG("Maximum KLD: %10.6f\n", kld_values.back());
printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f)); LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
printf("Median KLD: %10.6f\n", kld_median); LOG("Median KLD: %10.6f\n", kld_median);
printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f)); LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f)); LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f)); LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
printf("Minimum KLD: %10.6f\n", kld_values.front()); LOG("Minimum KLD: %10.6f\n", kld_values.front());
printf("\n"); LOG("\n");
printf("====== Token probability statistics ======\n"); LOG("====== Token probability statistics ======\n");
auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count); auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second); LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1]) auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
: p_diff_values[p_diff_values.size()/2]; : p_diff_values[p_diff_values.size()/2];
printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back()); LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f)); LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f)); LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f)); LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f)); LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f)); LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median); LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f)); LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f)); LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f)); LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f)); LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f)); LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front()); LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
// printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second); // LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
const double p_diff_rms_val = sqrt(p_diff_mse.first); const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
const double same_top_p = 1.0*kld.n_same_top/kld.count; const double same_top_p = 1.0*kld.n_same_top/kld.count;
printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1))); LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
@ -1966,16 +1959,18 @@ int main(int argc, char ** argv) {
params.n_ctx = 512; params.n_ctx = 512;
params.logits_all = true; params.logits_all = true;
params.escape = false;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
gpt_init();
const int32_t n_ctx = params.n_ctx; const int32_t n_ctx = params.n_ctx;
if (n_ctx <= 0) { if (n_ctx <= 0) {
fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__); LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
return 1; return 1;
} }
@ -2000,21 +1995,11 @@ int main(int argc, char ** argv) {
} }
if (params.ppl_stride > 0) { if (params.ppl_stride > 0) {
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n", LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
params.n_ctx, params.n_ctx + params.ppl_stride/2); params.n_ctx, params.n_ctx + params.ppl_stride/2);
params.n_ctx += params.ppl_stride/2; params.n_ctx += params.ppl_stride/2;
} }
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@ -2024,21 +2009,21 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__); LOG_ERR("%s: unable to load model\n", __func__);
return 1; return 1;
} }
const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) { if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx); __func__, n_ctx_train, params.n_ctx);
} }
// print system information // print system information
{ {
fprintf(stderr, "\n"); LOG_INF("\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
} }
struct results_perplexity results; struct results_perplexity results;
@ -2054,7 +2039,9 @@ int main(int argc, char ** argv) {
results = perplexity(ctx, params, n_ctx); results = perplexity(ctx, params, n_ctx);
} }
llama_print_timings(ctx); LOG("\n");
llama_perf_context_print(ctx);
write_logfile(ctx, params, model, results); write_logfile(ctx, params, model, results);
llama_free(ctx); llama_free(ctx);
View file
@ -1,7 +1,7 @@
#define LLAMA_API_INTERNAL
#include "common.h" #include "common.h"
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "llama.h"
#include "llama-impl.h"
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
@ -319,8 +319,7 @@ int main(int argc, char ** argv) {
} }
auto cparams = llama_context_default_params(); auto cparams = llama_context_default_params();
cparams.n_ctx = 256; cparams.n_ctx = 256;
cparams.seed = 1;
ctx = llama_new_context_with_model(model, cparams); ctx = llama_new_context_with_model(model, cparams);
View file
@ -1,6 +1,6 @@
set(TARGET llama-quantize) set(TARGET llama-quantize)
add_executable(${TARGET} quantize.cpp) add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common) target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
View file
@ -34,7 +34,7 @@ Run the quantized model:
```bash ```bash
# start inference on a gguf model # start inference on a gguf model
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128 ./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -cnv -p "You are a helpful assistant"
``` ```
When running the larger models, make sure you have enough disk space to store all the intermediate files. When running the larger models, make sure you have enough disk space to store all the intermediate files.
@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis
Several quantization methods are supported. They differ in the resulting model disk size and inference speed. Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
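For example, assuming `llama-quantize` accepts these variant names as the quantization type argument (a sketch; the file paths are illustrative and follow the naming used earlier in this README):

```bash
# quantize using the block-interleaved Q4_0_4_8 layout
# (same on-disk size as plain Q4_0, only the data layout differs)
./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_0_4_8.gguf Q4_0_4_8
```

Since the layout is the only difference, the resulting file is the same size as a plain `Q4_0` quantization; choose the variant that matches the optimized mulmat kernels available on the target hardware.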
*(outdated)* *(outdated)*
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
View file
@ -26,6 +26,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
{ "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", },
{ "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", },
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
@ -61,6 +63,16 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count"; static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count"; static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";
static bool striequals(const char * a, const char * b) {
while (*a && *b) {
if (std::tolower(*a) != std::tolower(*b)) {
return false;
}
a++; b++;
}
return *a == *b;
}
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str; std::string ftype_str;
@@ -68,7 +80,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
ftype_str.push_back(std::toupper(ch)); ftype_str.push_back(std::toupper(ch));
} }
for (auto & it : QUANT_OPTIONS) { for (auto & it : QUANT_OPTIONS) {
if (it.name == ftype_str) { if (striequals(it.name.c_str(), ftype_str.c_str())) {
ftype = it.ftype; ftype = it.ftype;
ftype_str_out = it.name; ftype_str_out = it.name;
return true; return true;
@@ -104,7 +116,7 @@ static void usage(const char * executable) {
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --keep-split: will generate quatized model in the same shards as input"); printf(" --keep-split: will generate quantized model in the same shards as input\n");
printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -223,15 +235,15 @@ static int prepare_imatrix(const std::string & imatrix_file,
} }
static ggml_type parse_ggml_type(const char * arg) { static ggml_type parse_ggml_type(const char * arg) {
ggml_type result = GGML_TYPE_COUNT; for (int i = 0; i < GGML_TYPE_COUNT; ++i) {
for (int j = 0; j < GGML_TYPE_COUNT; ++j) { auto type = (ggml_type)i;
auto type = ggml_type(j);
const auto * name = ggml_type_name(type); const auto * name = ggml_type_name(type);
if (name && strcmp(arg, name) == 0) { if (name && striequals(name, arg)) {
result = type; break; return type;
} }
} }
return result; fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
return GGML_TYPE_COUNT;
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
@@ -252,12 +264,18 @@ int main(int argc, char ** argv) {
} else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) { } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
if (arg_idx < argc-1) { if (arg_idx < argc-1) {
params.output_tensor_type = parse_ggml_type(argv[++arg_idx]); params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
if (params.output_tensor_type == GGML_TYPE_COUNT) {
usage(argv[0]);
}
} else { } else {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) { } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
if (arg_idx < argc-1) { if (arg_idx < argc-1) {
params.token_embedding_type = parse_ggml_type(argv[++arg_idx]); params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
if (params.token_embedding_type == GGML_TYPE_COUNT) {
usage(argv[0]);
}
} else { } else {
usage(argv[0]); usage(argv[0]);
} }
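Taken together, the changes above mean the ggml type names passed to `--output-tensor-type` and `--token-embedding-type` are matched case-insensitively, and an unrecognized name is rejected up front instead of slipping through as an invalid value. A sketch of an invocation exercising this (the model paths are placeholders):

```bash
# "Q8_0" and "q8_0" now both match the lowercase ggml type name;
# an unknown type name is rejected and the usage text is printed
./llama-quantize --output-tensor-type Q8_0 --token-embedding-type Q8_0 \
    ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
```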

View file

@@ -1,15 +1,16 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <algorithm> #include <algorithm>
#include <fstream> #include <fstream>
#include <iostream> // TODO: remove me
static void print_usage(int argc, char ** argv, const gpt_params & params) { static void print_usage(int, char ** argv) {
gpt_params_print_usage(argc, argv, params); LOG("\nexample usage:\n");
LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
LOG_TEE("\nexample usage:\n"); LOG("\n");
LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
LOG_TEE("\n");
} }
struct chunk { struct chunk {
@@ -18,7 +19,7 @@ struct chunk {
// original file position // original file position
size_t filepos; size_t filepos;
// original text data // original text data
std::string textdata = ""; std::string textdata;
// tokenized text data // tokenized text data
std::vector<llama_token> tokens; std::vector<llama_token> tokens;
// embedding // embedding
@@ -32,14 +33,14 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
std::ifstream f(filename.c_str()); std::ifstream f(filename.c_str());
if (!f.is_open()) { if (!f.is_open()) {
fprintf(stderr, "Error: could not open file %s\n", filename.c_str()); LOG_ERR("could not open file %s\n", filename.c_str());
return chunks; return chunks;
} }
chunk current_chunk; chunk current_chunk;
char buffer[1024]; char buffer[1024];
int64_t filepos = 0; int64_t filepos = 0;
std::string current = ""; std::string current;
while (f.read(buffer, 1024)) { while (f.read(buffer, 1024)) {
current += std::string(buffer, f.gcount()); current += std::string(buffer, f.gcount());
size_t pos; size_t pos;
@@ -85,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
// run model // run model
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_decode(ctx, batch) < 0) { if (llama_decode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to decode\n", __func__); LOG_ERR("%s : failed to decode\n", __func__);
} }
for (int i = 0; i < batch.n_tokens; i++) { for (int i = 0; i < batch.n_tokens; i++) {
@@ -100,7 +101,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
if (embd == NULL) { if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i); embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) { if (embd == NULL) {
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i); LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
continue; continue;
} }
} }
@@ -113,29 +114,28 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
print_usage(argc, argv, params);
return 1; return 1;
} }
gpt_init();
// For BERT models, batch size must be equal to ubatch size // For BERT models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch; params.n_ubatch = params.n_batch;
params.embedding = true; params.embedding = true;
if (params.chunk_size <= 0) { if (params.chunk_size <= 0) {
fprintf(stderr, "chunk_size must be positive\n"); LOG_ERR("chunk_size must be positive\n");
return 1; return 1;
} }
if (params.context_files.empty()) { if (params.context_files.empty()) {
fprintf(stderr, "context_files must be specified\n"); LOG_ERR("context_files must be specified\n");
return 1; return 1;
} }
print_build_info(); LOG_INF("processing files:\n");
printf("processing files:\n");
for (auto & context_file : params.context_files) { for (auto & context_file : params.context_files) {
printf("%s\n", context_file.c_str()); LOG_INF("%s\n", context_file.c_str());
} }
std::vector<chunk> chunks; std::vector<chunk> chunks;
@@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator); std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
} }
printf("Number of chunks: %ld\n", chunks.size()); LOG_INF("Number of chunks: %ld\n", chunks.size());
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__); LOG_ERR("%s: unable to load model\n", __func__);
return 1; return 1;
} }
@@ -164,19 +164,19 @@ int main(int argc, char ** argv) {
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (pooling_type == LLAMA_POOLING_TYPE_NONE) { if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); LOG_ERR("%s: pooling type NONE not supported\n", __func__);
return 1; return 1;
} }
if (n_ctx > n_ctx_train) { if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx); __func__, n_ctx_train, n_ctx);
} }
// print system information // print system information
{ {
fprintf(stderr, "\n"); LOG_INF("\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
} }
// max batch size // max batch size
@@ -187,7 +187,7 @@ int main(int argc, char ** argv) {
for (auto & chunk : chunks) { for (auto & chunk : chunks) {
auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false); auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
if (inp.size() > n_batch) { if (inp.size() > n_batch) {
fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch); __func__, (long long int) inp.size(), (long long int) n_batch);
return 1; return 1;
} }
@@ -201,12 +201,12 @@ int main(int argc, char ** argv) {
// tokenization stats // tokenization stats
if (params.verbose_prompt) { if (params.verbose_prompt) {
for (int i = 0; i < (int) chunks.size(); i++) { for (int i = 0; i < (int) chunks.size(); i++) {
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
for (int j = 0; j < (int) chunks[i].tokens.size(); j++) { for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
} }
fprintf(stderr, "\n\n"); LOG_INF("\n\n");
} }
} }
@@ -253,14 +253,15 @@ int main(int argc, char ** argv) {
chunks[i].tokens.clear(); chunks[i].tokens.clear();
} }
struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
// start loop, receive query and return top k similar chunks based on cosine similarity // start loop, receive query and return top k similar chunks based on cosine similarity
std::string query; std::string query;
while (true) { while (true) {
printf("Enter query: "); LOG("Enter query: ");
std::getline(std::cin, query); std::getline(std::cin, query);
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true); std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
batch_add_seq(query_batch, query_tokens, 0); batch_add_seq(query_batch, query_tokens, 0);
std::vector<float> query_emb(n_embd, 0); std::vector<float> query_emb(n_embd, 0);
@@ -281,19 +282,22 @@ int main(int argc, char ** argv) {
return a.second > b.second; return a.second > b.second;
}); });
printf("Top %d similar chunks:\n", params.sparams.top_k); LOG("Top %d similar chunks:\n", params.sparams.top_k);
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) { for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
printf("filename: %s\n", chunks[similarities[i].first].filename.c_str()); LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
printf("similarity: %f\n", similarities[i].second); LOG("similarity: %f\n", similarities[i].second);
printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str()); LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
printf("--------------------\n"); LOG("--------------------\n");
} }
} }
} }
LOG("\n");
llama_perf_context_print(ctx);
// clean up // clean up
llama_print_timings(ctx); llama_batch_free(query_batch);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
llama_backend_free(); llama_backend_free();
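For reference, an end-to-end run of this example looks roughly like the usage string above suggests (the `llama-retrieval` binary name and the model path are assumptions):

```bash
./llama-retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 \
    --context-file README.md --context-file License \
    --chunk-size 100 --chunk-separator .
# at the "Enter query:" prompt, type a question; the top-k chunks with the
# highest cosine similarity to the query embedding are printed back
```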

View file

@@ -10,20 +10,21 @@ This can be used for distributed LLM inference with `llama.cpp` in the following
```mermaid ```mermaid
flowchart TD flowchart TD
rpcb---|TCP|srva rpcb<-->|TCP|srva
rpcb---|TCP|srvb rpcb<-->|TCP|srvb
rpcb-.-|TCP|srvn rpcb<-.->|TCP|srvn
subgraph hostn[Host N] subgraph hostn[Host N]
srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"] srvn[rpc-server]<-.->backend3["Backend (CUDA,Metal,etc.)"]
end end
subgraph hostb[Host B] subgraph hostb[Host B]
srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"] srvb[rpc-server]<-->backend2["Backend (CUDA,Metal,etc.)"]
end end
subgraph hosta[Host A] subgraph hosta[Host A]
srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"] srva[rpc-server]<-->backend["Backend (CUDA,Metal,etc.)"]
end end
subgraph host[Main Host] subgraph host[Main Host]
ggml[llama.cpp]---rpcb[RPC backend] local["Backend (CUDA,Metal,etc.)"]<-->ggml[llama-cli]
ggml[llama-cli]<-->rpcb[RPC backend]
end end
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5 style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
``` ```
@@ -62,17 +63,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device. This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
On the main host build `llama.cpp` only with `-DGGML_RPC=ON`: On the main host build `llama.cpp` for the local backend and add `-DGGML_RPC=ON` to the build options.
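For example, assuming a CUDA local backend (a sketch; swap the backend flag for your hardware):

```bash
# enable the local CUDA backend and the RPC backend in the same build
cmake -B build -DGGML_CUDA=ON -DGGML_RPC=ON
cmake --build build --config Release
```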
Finally, when running `llama-cli`, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash
mkdir build-rpc
cd build-rpc
cmake .. -DGGML_RPC=ON
cmake --build . --config Release
```
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash ```bash
$ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99 $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
``` ```
This way you can offload model layers to both local and remote devices.

Some files were not shown because too many files have changed in this diff.