Merge branch 'master' into pr/8917

This commit is contained in:
Nexesenex 2024-10-10 02:33:59 +02:00
commit ed78de20e4
297 changed files with 44760 additions and 25973 deletions

View file

@ -1,18 +1,16 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
@ -24,13 +22,12 @@ WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
RUN make -j$(nproc)
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc) && \
cp build/bin/* .
ENTRYPOINT ["/app/.devops/tools.sh"]

View file

@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
gfx1102"
COPY requirements.txt requirements.txt
COPY requirements requirements
@ -34,7 +34,7 @@ WORKDIR /app
COPY . .
# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang

View file

@ -0,0 +1,44 @@
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
FROM cosdt/cann:$ASCEND_VERSION AS build
WORKDIR /app
COPY . .
RUN yum install -y gcc g++ cmake make
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
# find libascend_hal.so, because the drive hasn`t been mounted.
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --config Release --target llama-cli
# TODO: use image with NNRT
FROM cosdt/cann:$ASCEND_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
ENTRYPOINT ["/llama-cli" ]

View file

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
@ -8,28 +8,30 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git
apt-get install -y build-essential git cmake
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
RUN make -j$(nproc) llama-cli
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc)
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/llama-cli /llama-cli
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENTRYPOINT [ "/llama-cli" ]

View file

@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
gfx1102"
COPY requirements.txt requirements.txt
COPY requirements requirements
@ -34,7 +34,7 @@ WORKDIR /app
COPY . .
# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang

View file

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
@ -8,31 +8,34 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
RUN make -j$(nproc) llama-server
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-server -j$(nproc)
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/llama-server /llama-server
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-server /llama-server
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

View file

@ -26,6 +26,8 @@ RUN apt-get update && \
COPY --from=build /app/build/bin/llama-server /llama-server
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

View file

@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
gfx1102"
COPY requirements.txt requirements.txt
COPY requirements requirements
@ -34,11 +34,13 @@ WORKDIR /app
COPY . .
# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
# Enable cURL
ENV LLAMA_CURL=1

View file

@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \
rm -rf /app
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

View file

@ -21,6 +21,8 @@ RUN apt-get update && \
COPY --from=build /app/llama-server /llama-server
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

View file

@ -1,13 +1,52 @@
{ inputs, ... }:
{
perSystem =
{ config, lib, ... }:
{
config,
lib,
system,
...
}:
{
devShells =
lib.concatMapAttrs
(name: package: {
${name} = package.passthru.shell;
${name + "-extra"} = package.passthru.shell-extra;
})
config.packages;
let
pkgs = import inputs.nixpkgs { inherit system; };
stdenv = pkgs.stdenv;
scripts = config.packages.python-scripts;
in
lib.pipe (config.packages) [
(lib.concatMapAttrs (
name: package: {
${name} = pkgs.mkShell {
name = "${name}";
inputsFrom = [ package ];
shellHook = ''
echo "Entering ${name} devShell"
'';
};
"${name}-extra" =
if (name == "python-scripts") then
null
else
pkgs.mkShell {
name = "${name}-extra";
inputsFrom = [
package
scripts
];
# Extra packages that *may* be used by some scripts
packages = [
pkgs.python3Packages.tiktoken
];
shellHook = ''
echo "Entering ${name} devShell"
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
'';
};
}
))
(lib.filterAttrs (name: value: value != null))
];
};
}

View file

@ -26,16 +26,14 @@
config.cudaSupport = true;
config.allowUnfreePredicate =
p:
builtins.all
(
builtins.all (
license:
license.free
|| builtins.elem license.shortName [
"CUDA EULA"
"cuDNN EULA"
]
)
(p.meta.licenses or [ p.meta.license ]);
) (p.meta.licenses or [ p.meta.license ]);
};
# Ensure dependencies use ROCm consistently
pkgsRocm = import inputs.nixpkgs {

View file

@ -0,0 +1,36 @@
{
lib,
llamaVersion,
numpy,
tqdm,
sentencepiece,
pyyaml,
poetry-core,
buildPythonPackage,
pytestCheckHook,
}:
buildPythonPackage {
pname = "gguf";
version = llamaVersion;
pyproject = true;
nativeBuildInputs = [ poetry-core ];
propagatedBuildInputs = [
numpy
tqdm
sentencepiece
pyyaml
];
src = lib.cleanSource ../../gguf-py;
pythonImportsCheck = [
"numpy"
"gguf"
];
nativeCheckInputs = [ pytestCheckHook ];
doCheck = true;
meta = with lib; {
description = "Python package for writing binary files in the GGUF format";
license = licenses.mit;
maintainers = [ maintainers.ditsuke ];
};
}

View file

@ -3,31 +3,33 @@
glibc,
config,
stdenv,
mkShell,
runCommand,
cmake,
ninja,
pkg-config,
git,
python3,
mpi,
blas,
cudaPackages,
autoAddDriverRunpath,
darwin,
rocmPackages,
vulkan-headers,
vulkan-loader,
curl,
shaderc,
useBlas ? builtins.all (x: !x) [
useBlas ?
builtins.all (x: !x) [
useCuda
useMetalKit
useRocm
useVulkan
] && blas.meta.available,
]
&& blas.meta.available,
useCuda ? config.cudaSupport,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
useMpi ? false, # Increases the runtime closure size by ~700M
# Increases the runtime closure size by ~700M
useMpi ? false,
useRocm ? config.rocmSupport,
enableCurl ? true,
useVulkan ? false,
@ -37,8 +39,8 @@
# otherwise we get libstdc++ errors downstream.
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
enableStatic ? effectiveStdenv.hostPlatform.isStatic,
precompileMetalShaders ? false
}@inputs:
precompileMetalShaders ? false,
}:
let
inherit (lib)
@ -46,7 +48,6 @@ let
cmakeFeature
optionals
strings
versionOlder
;
stdenv = throw "Use effectiveStdenv instead";
@ -62,54 +63,11 @@ let
pnameSuffix =
strings.optionalString (suffices != [ ])
"-${strings.concatMapStringsSep "-" strings.toLower suffices}";
descriptionSuffix =
strings.optionalString (suffices != [ ])
", accelerated with ${strings.concatStringsSep ", " suffices}";
descriptionSuffix = strings.optionalString (
suffices != [ ]
) ", accelerated with ${strings.concatStringsSep ", " suffices}";
executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
# TODO: package the Python in this repository in a Nix-like way.
# It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
# is PEP 517-compatible, and ensure the correct .dist-info is generated.
# https://peps.python.org/pep-0517/
#
# TODO: Package up each Python script or service appropriately, by making
# them into "entrypoints"
llama-python = python3.withPackages (
ps: [
ps.numpy
ps.sentencepiece
]
);
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
llama-python-extra = python3.withPackages (
ps: [
ps.numpy
ps.sentencepiece
ps.tiktoken
ps.torchWithoutCuda
ps.transformers
# server bench
ps.matplotlib
# server tests
ps.openai
ps.behave
ps.prometheus-client
# for examples/pydantic-models-to-grammar-examples.py
ps.docstring-parser
ps.pydantic
# for scripts/compare-llama-bench.py
ps.gitpython
ps.tabulate
]
);
xcrunHost = runCommand "xcrunHost" {} ''
xcrunHost = runCommand "xcrunHost" { } ''
mkdir -p $out/bin
ln -s /usr/bin/xcrun $out/bin
'';
@ -144,8 +102,7 @@ let
];
in
effectiveStdenv.mkDerivation (
finalAttrs: {
effectiveStdenv.mkDerivation (finalAttrs: {
pname = "llama-cpp${pnameSuffix}";
version = llamaVersion;
@ -193,15 +150,10 @@ effectiveStdenv.mkDerivation (
++ optionals useCuda [
cudaPackages.cuda_nvcc
# TODO: Replace with autoAddDriverRunpath
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
cudaPackages.autoAddOpenGLRunpathHook
autoAddDriverRunpath
]
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
glibc.static
] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
xcrunHost
];
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
buildInputs =
optionals effectiveStdenv.isDarwin darwinBuildInputs
@ -256,35 +208,6 @@ effectiveStdenv.mkDerivation (
cp $src/include/llama.h $out/include/
'';
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
passthru = {
inherit
useBlas
useCuda
useMetalKit
useMpi
useRocm
useVulkan
;
shell = mkShell {
name = "shell-${finalAttrs.finalPackage.name}";
description = "contains numpy and sentencepiece";
buildInputs = [ llama-python ];
inputsFrom = [ finalAttrs.finalPackage ];
shellHook = ''
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
'';
};
shell-extra = mkShell {
name = "shell-extra-${finalAttrs.finalPackage.name}";
description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
buildInputs = [ llama-python-extra ];
inputsFrom = [ finalAttrs.finalPackage ];
};
};
meta = {
# Configurations we don't want even the CI to evaluate. Results in the
# "unsupported platform" messages. This is mostly a no-op, because
@ -320,5 +243,4 @@ effectiveStdenv.mkDerivation (
# Extend `badPlatforms` instead
platforms = lib.platforms.all;
};
}
)
})

View file

@ -0,0 +1,66 @@
{
lib,
stdenv,
buildPythonPackage,
poetry-core,
mkShell,
python3Packages,
gguf-py,
}@inputs:
let
llama-python-deps = with python3Packages; [
numpy
sentencepiece
transformers
protobuf
torchWithoutCuda
gguf-py
tqdm
# for scripts/compare-llama-bench.py
gitpython
tabulate
# for examples/pydantic-models-to-grammar-examples.py
docstring-parser
pydantic
];
llama-python-test-deps = with python3Packages; [
# Server bench
matplotlib
# server tests
openai
behave
prometheus-client
];
in
buildPythonPackage ({
pname = "llama-scripts";
version = "0.0.0";
pyproject = true;
# NOTE: The files filtered out here are not visible in the build sandbox, neither
# do they affect the output hash. They can be modified without triggering a rebuild.
src = lib.cleanSourceWith {
filter =
name: type:
let
any = builtins.any (x: x);
baseName = builtins.baseNameOf name;
in
any [
(lib.hasSuffix ".py" name)
(baseName == "README.md")
(baseName == "pyproject.toml")
];
src = lib.cleanSource ../../.;
};
nativeBuildInputs = [ poetry-core ];
nativeCheckInputs = llama-python-test-deps;
dependencies = llama-python-deps;
})

View file

@ -1,19 +1,41 @@
{
lib,
newScope,
python3,
llamaVersion ? "0.0.0",
}:
let
pythonPackages = python3.pkgs;
buildPythonPackage = pythonPackages.buildPythonPackage;
numpy = pythonPackages.numpy;
tqdm = pythonPackages.tqdm;
sentencepiece = pythonPackages.sentencepiece;
pyyaml = pythonPackages.pyyaml;
poetry-core = pythonPackages.poetry-core;
pytestCheckHook = pythonPackages.pytestCheckHook;
in
# We're using `makeScope` instead of just writing out an attrset
# because it allows users to apply overlays later using `overrideScope'`.
# Cf. https://noogle.dev/f/lib/makeScope
lib.makeScope newScope (
self: {
lib.makeScope newScope (self: {
inherit llamaVersion;
gguf-py = self.callPackage ./package-gguf-py.nix {
inherit
buildPythonPackage
numpy
tqdm
sentencepiece
poetry-core
pyyaml
pytestCheckHook
;
};
python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
llama-cpp = self.callPackage ./package.nix { };
docker = self.callPackage ./docker.nix { };
docker-min = self.callPackage ./docker.nix { interactive = false; };
sif = self.callPackage ./sif.nix { };
}
)
})

View file

@ -1,7 +1,7 @@
*.o
*.a
.cache/
.git/
# Do not ignore .git directory, otherwise the reported build number will always be 0
.github/
.gitignore
.vs/

2
.ecrc
View file

@ -1,5 +1,5 @@
{
"Exclude": ["^\\.gitmodules$"],
"Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
"Disable": {
"IndentSize": true
}

View file

@ -1,3 +1,6 @@
# TODO: there have been some issues with the workflow, so disabling for now
# https://github.com/ggerganov/llama.cpp/issues/7893
#
# Benchmark
name: Benchmark
@ -24,10 +27,10 @@ on:
push:
branches:
- master
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
pull_request_target:
types: [opened, synchronize, reopened]
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
schedule:
- cron: '04 2 * * *'
@ -129,6 +132,8 @@ jobs:
- name: Server bench
id: server_bench
env:
HEAD_REF: ${{ github.head_ref || github.ref_name }}
run: |
set -eux
@ -137,7 +142,7 @@ jobs:
python bench.py \
--runner-label ${{ env.RUNNER_LABEL }} \
--name ${{ github.job }} \
--branch ${{ github.head_ref || github.ref_name }} \
--branch $HEAD_REF \
--commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
--scenario script.js \
--duration ${{ github.event.inputs.duration || env.DURATION }} \

View file

@ -19,10 +19,18 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
contents: write # for creating release
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
macOS-latest-cmake-arm64:
@ -47,7 +55,7 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@ -105,7 +113,7 @@ jobs:
sysctl -a
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@ -222,7 +230,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
cmake --build . --config Release -j $(nproc)
- name: Test
@ -375,7 +383,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Dependencies
id: depends
@ -401,7 +409,7 @@ jobs:
continue-on-error: true
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: add oneAPI to apt
shell: bash
@ -442,7 +450,7 @@ jobs:
continue-on-error: true
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: add oneAPI to apt
shell: bash
@ -546,7 +554,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v4
- name: Dependencies
id: depends
@ -576,7 +584,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v4
- name: Dependencies
id: depends
@ -610,7 +618,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v4
- name: Dependencies
id: depends
@ -696,22 +704,20 @@ jobs:
strategy:
matrix:
include:
- build: 'rpc-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
- build: 'noavx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
- build: 'avx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'openblas-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
- build: 'vulkan-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
- build: 'llvm-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'msvc-arm64'
@ -859,7 +865,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
@ -955,6 +961,7 @@ jobs:
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
@ -966,19 +973,20 @@ jobs:
name: llama-bin-win-sycl-x64.zip
windows-latest-cmake-hip:
if: ${{ github.event.inputs.create_release != 'true' }}
runs-on: windows-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Install
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"
@ -993,8 +1001,72 @@ jobs:
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON
cmake --build build --config Release
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
windows-latest-cmake-hip-release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
runs-on: windows-latest
strategy:
matrix:
gpu_target: [gfx1100, gfx1101, gfx1030]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Install
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
id: verify
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: Build
id: cmake_build
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
run: |
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
ios-xcode-build:
runs-on: macos-latest
@ -1059,6 +1131,7 @@ jobs:
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-cuda
- windows-latest-cmake-hip-release
- macOS-latest-cmake-arm64
- macOS-latest-cmake-x64

View file

@ -3,6 +3,11 @@ on:
schedule:
- cron: "42 0 * * *"
# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
issues: write
jobs:
close-issues:
runs-on: ubuntu-latest

View file

@ -15,11 +15,17 @@ on:
branches:
- master
paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
workflow_dispatch: # allows manual triggering, useful for debugging
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
packages: write
jobs:
push_to_registry:
name: Push Docker image to Docker Hub
@ -37,15 +43,17 @@ jobs:
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
# Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
steps:
- name: Check out the repo
uses: actions/checkout@v4
with:
fetch-depth: 0 # preserve git history, so we can determine the build number
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
@ -60,6 +68,34 @@ jobs:
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
# determine tag name postfix (build number, commit hash)
if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
TAG_POSTFIX="b${BUILD_NUMBER}"
else
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
fi
# list all tags possible
TAGS=""
TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
echo "output_tags=$TAGS" # print out for debugging
env:
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
# https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
@ -77,40 +113,13 @@ jobs:
docker-images: true
swap-storage: true
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Downcase github.repository_owner
run: |
echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
- name: Build and push Docker image (versioned)
- name: Build and push Docker image (tagged + versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v4
uses: docker/build-push-action@v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
- name: Build and push Docker image (tagged)
uses: docker/build-push-action@v4
with:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
# tag list is generated from step above
tags: ${{ steps.tag.outputs.output_tags }}
file: ${{ matrix.config.dockerfile }}

View file

@ -21,6 +21,13 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
# https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
id-token: write
contents: read
jobs:
nix-build-aarch64:
runs-on: ubuntu-latest

View file

@ -12,6 +12,13 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
# https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
id-token: write
contents: read
jobs:
nix-eval:
strategy:

View file

@ -6,15 +6,13 @@ on:
- '.github/workflows/python-check-requirements.yml'
- 'scripts/check-requirements.sh'
- 'convert*.py'
- 'requirements.txt'
- 'requirements/*.txt'
- '**/requirements*.txt'
pull_request:
paths:
- '.github/workflows/python-check-requirements.yml'
- 'scripts/check-requirements.sh'
- 'convert*.py'
- 'requirements.txt'
- 'requirements/*.txt'
- '**/requirements*.txt'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

View file

@ -4,11 +4,13 @@ on:
push:
paths:
- '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json'
- '**.py'
- '**/requirements*.txt'
pull_request:
paths:
- '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json'
- '**.py'
- '**/requirements*.txt'
@ -33,6 +35,6 @@ jobs:
- name: Type-check with Pyright
uses: jakebailey/pyright-action@v2
with:
version: 1.1.370
version: 1.1.382
level: warning
warnings: true

View file

@ -20,6 +20,12 @@ on:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
env:
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
LLAMA_LOG_VERBOSITY: 10
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
@ -173,6 +179,7 @@ jobs:
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
run: |
cd examples/server/tests
$env:PYTHONIOENCODING = ":replace"
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
- name: Slow tests

5
.gitignore vendored
View file

@ -61,6 +61,7 @@ llama-batched-swift
/rpc-server
out/
tmp/
autogen-*.md
# Deprecated
@ -79,7 +80,6 @@ models-mnt
!models/ggml-vocab-*.gguf*
# Zig
zig-out/
zig-cache/
@ -130,3 +130,6 @@ poetry.toml
# Scripts
!/scripts/install-oneapi.bat
# Test models for lora adapters
/lora-tests

View file

@ -62,6 +62,9 @@ option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
# utils
option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
# extra artifacts
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@ -82,11 +85,11 @@ set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
# change the default for these ggml options
if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE ON)
set(GGML_LLAMAFILE_DEFAULT ON)
endif()
if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
set(GGML_CUDA_USE_GRAPHS ON)
if (NOT DEFINED GGML_CUDA_GRAPHS)
set(GGML_CUDA_GRAPHS_DEFAULT ON)
endif()
# transition helpers
@ -139,10 +142,16 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
# determining _precisely_ which defines are necessary for the llama-config
# package.
#
set(GGML_TRANSIENT_DEFINES)
get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
if (GGML_DIR_DEFINES)
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
endif()
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
if (GGML_TARGET_DEFINES)
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
endif()
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
@ -185,17 +194,19 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION lib/pkgconfig)
#
# programs, examples and tests
# utils, programs, examples and tests
#
add_subdirectory(common)
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif ()
endif()
if (LLAMA_BUILD_EXAMPLES)
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()

View file

@ -28,11 +28,12 @@
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
{
"name": "arm64-windows-msvc", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"toolset": { "value": "host=x64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
}
@ -41,7 +42,7 @@
{
"name": "arm64-windows-llvm", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"toolset": { "value": "host=x64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
}
@ -60,6 +61,8 @@
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
]
}

View file

@ -1,24 +1,23 @@
# Pull requests (for contributors)
- Test your changes:
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the `ggml` library
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
- Consider allowing write access to your branch for faster review
- Optionally rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
# Pull requests (for collaborators)
- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
# Coding guidelines
- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
@ -27,3 +26,8 @@
![matmul](media/matmul.png)
# Resources
The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
https://github.com/ggerganov/llama.cpp/projects

113
Makefile
View file

@ -5,7 +5,6 @@ BUILD_TARGETS = \
llama-batched \
llama-batched-bench \
llama-bench \
llama-benchmark-matmult \
llama-cli \
llama-convert-llama2c-to-ggml \
llama-embedding \
@ -19,6 +18,7 @@ BUILD_TARGETS = \
llama-imatrix \
llama-infill \
llama-llava-cli \
llama-minicpmv-cli\
llama-lookahead \
llama-lookup \
llama-lookup-create \
@ -38,10 +38,12 @@ BUILD_TARGETS = \
llama-tokenize \
llama-vdot \
llama-cvector-generator \
llama-gen-docs \
tests/test-c.o
# Binaries only useful for tests
TEST_TARGETS = \
tests/test-arg-parser \
tests/test-autorelease \
tests/test-backend-ops \
tests/test-chat-template \
@ -51,6 +53,7 @@ TEST_TARGETS = \
tests/test-grammar-parser \
tests/test-json-schema-to-grammar \
tests/test-llama-grammar \
tests/test-log \
tests/test-model-load-cancel \
tests/test-opt \
tests/test-quantize-fns \
@ -64,7 +67,7 @@ TEST_TARGETS = \
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
@ -145,6 +148,14 @@ GGML_NO_METAL := 1
DEPRECATE_WARNING := 1
endif
ifdef LLAMA_DISABLE_LOGS
REMOVE_WARNING := 1
endif
ifdef LLAMA_SERVER_VERBOSE
REMOVE_WARNING := 1
endif
ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif
@ -348,19 +359,11 @@ ifdef LLAMA_SANITIZE_UNDEFINED
MK_LDFLAGS += -fsanitize=undefined -g
endif
ifdef LLAMA_SERVER_VERBOSE
MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif
ifdef LLAMA_SERVER_SSL
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
MK_LDFLAGS += -lssl -lcrypto
endif
ifdef LLAMA_DISABLE_LOGS
MK_CPPFLAGS += -DLOG_DISABLE_LOGS
endif # LLAMA_DISABLE_LOGS
# warnings
WARN_FLAGS = \
-Wall \
@ -431,7 +434,7 @@ endif
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
ifndef RISCV
ifndef RISCV_CROSS_COMPILE
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
# Use all CPU extensions that are available:
@ -511,7 +514,12 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),)
MK_CXXFLAGS += -mlasx
endif
else
ifneq ($(filter riscv64%,$(UNAME_M)),)
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif
else # RISC-V CROSS COMPILATION
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif
@ -602,7 +610,7 @@ ifdef GGML_CUDA
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22
else
ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda
@ -610,7 +618,7 @@ ifdef GGML_CUDA
CUDA_PATH ?= /usr/local/cuda
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_NVCCFLAGS += -use_fast_math
endif # GGML_MUSA
@ -762,6 +770,10 @@ ifdef GGML_VULKAN_MEMORY_DEBUG
MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
endif
ifdef GGML_VULKAN_PERF
MK_CPPFLAGS += -DGGML_VULKAN_PERF
endif
ifdef GGML_VULKAN_VALIDATE
MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
endif
@ -918,11 +930,12 @@ OBJ_LLAMA = \
OBJ_COMMON = \
common/common.o \
common/arg.o \
common/log.o \
common/console.o \
common/ngram-cache.o \
common/sampling.o \
common/train.o \
common/grammar-parser.o \
common/build-info.o \
common/json-schema-to-grammar.o
@ -1015,6 +1028,14 @@ $(info - LLAMA_NO_CCACHE)
$(info )
endif
ifdef REMOVE_WARNING
$(info !!! REMOVAL WARNING !!!)
$(info The following LLAMA_ options have been removed and are no longer supported)
$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418))
$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
$(info )
endif
#
# Build libraries
#
@ -1033,10 +1054,11 @@ ggml/src/ggml-alloc.o: \
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-backend.o: \
ggml/src/ggml-backend.c \
ggml/src/ggml-backend.cpp \
ggml/src/ggml-backend-impl.h \
ggml/include/ggml.h \
ggml/include/ggml-backend.h
$(CC) $(CFLAGS) -c $< -o $@
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml/src/ggml-quants.o: \
ggml/src/ggml-quants.c \
@ -1151,6 +1173,16 @@ common/common.o: \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/arg.o: \
common/arg.cpp \
common/arg.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/log.o: \
common/log.cpp \
common/log.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/sampling.o: \
common/sampling.cpp \
common/sampling.h \
@ -1162,11 +1194,6 @@ common/console.o: \
common/console.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/grammar-parser.o: \
common/grammar-parser.cpp \
common/grammar-parser.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/json-schema-to-grammar.o: \
common/json-schema-to-grammar.cpp \
common/json-schema-to-grammar.h
@ -1206,6 +1233,7 @@ clean:
rm -rvf ggml/*.dll
rm -rvf ggml/*.so
rm -vrf ggml/src/*.o
rm -rvf ggml/src/llamafile/*.o
rm -rvf common/build-info.cpp
rm -vrf ggml/src/ggml-metal-embed.metal
rm -vrf ggml/src/ggml-cuda/*.o
@ -1333,7 +1361,7 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
$(OBJ_GGML) $(OBJ_LLAMA)
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1427,6 +1455,7 @@ llama-server: \
examples/server/system-prompts.js.hpp \
examples/server/prompt-formats.js.hpp \
examples/server/json-schema-to-grammar.mjs.hpp \
examples/server/loading.html.hpp \
common/json.hpp \
common/stb_image.h \
$(OBJ_ALL)
@ -1442,6 +1471,11 @@ examples/server/%.hpp: examples/server/public/% Makefile
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
) > $@
llama-gen-docs: examples/gen-docs/gen-docs.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
libllava.a: examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
@ -1452,15 +1486,20 @@ libllava.a: examples/llava/llava.cpp \
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
llama-llava-cli: examples/llava/llava-cli.cpp \
examples/llava/clip.h \
examples/llava/clip.cpp \
examples/llava/llava.h \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
@ -1484,21 +1523,21 @@ common/build-info.o: common/build-info.cpp
tests: $(TEST_TARGETS)
llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp \
$(OBJ_GGML) common/build-info.o
tests/test-arg-parser: tests/test-arg-parser.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
run-benchmark-matmult: llama-benchmark-matmult
./$@
.PHONY: run-benchmark-matmult swift
tests/test-llama-grammar: tests/test-llama-grammar.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-log: tests/test-log.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-grammar-parser: tests/test-grammar-parser.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)

View file

@ -11,7 +11,7 @@ var sources = [
"src/unicode-data.cpp",
"ggml/src/ggml.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.c",
"ggml/src/ggml-backend.cpp",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-aarch64.c",
]

View file

@ -10,32 +10,15 @@
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
> [!IMPORTANT]
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
## Recent API changes
- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
## Hot topics
- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
----
@ -95,6 +78,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
@ -105,6 +89,10 @@ Typically finetunes of the base models below are supported as well.
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
@ -126,6 +114,7 @@ Typically finetunes of the base models below are supported as well.
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
@ -179,17 +168,23 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
**Tools:**
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
- [akx/ollama-dl](https://github.com/akx/ollama-dl) download models from the Ollama library to be used directly with llama.cpp
- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
**Infrastructure:**
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
**Games:**
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
@ -422,6 +417,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
| [Vulkan](./docs/build.md#vulkan) | GPU |
| [CANN](./docs/build.md#cann) | Ascend NPU |
## Tools
@ -449,7 +445,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Any help with managing issues and PRs is very appreciated!
- Any help with managing issues, PRs and projects is very appreciated!
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)

120
ci/run.sh
View file

@ -1,4 +1,4 @@
#/bin/bash
#!/bin/bash
#
# sample usage:
#
@ -13,6 +13,9 @@
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
@ -40,7 +43,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
@ -52,6 +55,10 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi
if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi
## helpers
# download a file if it does not exist or if it is outdated
@ -107,7 +114,7 @@ function gg_run_ctest_debug {
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@ -138,7 +145,7 @@ function gg_run_ctest_release {
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@ -266,7 +273,6 @@ function gg_sum_ctest_with_model_release {
}
# open_llama_7b_v2
# requires: GG_BUILD_CUDA
function gg_run_open_llama_7b_v2 {
cd ${SRC}
@ -290,8 +296,8 @@ function gg_run_open_llama_7b_v2 {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -425,7 +431,7 @@ function gg_run_pythia_1_4b {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -535,7 +541,6 @@ function gg_sum_pythia_1_4b {
}
# pythia_2_8b
# requires: GG_BUILD_CUDA
function gg_run_pythia_2_8b {
cd ${SRC}
@ -556,8 +561,8 @@ function gg_run_pythia_2_8b {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -692,7 +697,7 @@ function gg_run_embd_bge_small {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -707,6 +712,82 @@ function gg_run_embd_bge_small {
set +e
}
function gg_sum_embd_bge_small {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'BGE Small (BERT):\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
}
# rerank_tiny
function gg_run_rerank_tiny {
cd ${SRC}
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json
path_models="../models-mnt/rerank-tiny"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
# for this model, the SEP token is "</s>"
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
# sample output
# rerank score 0: 0.029
# rerank score 1: 0.029
# rerank score 2: 0.135
# check that the score is in the range [$3, $4]
function check_score {
qnt="$1"
score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$score"
return 0
}
check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log
set +e
}
function gg_sum_rerank_tiny {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Rerank Tiny (Jina):\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
}
function gg_check_build_requirements {
if ! command -v cmake &> /dev/null; then
gg_printf 'cmake not found, please install'
@ -721,17 +802,11 @@ function gg_check_build_requirements {
fi
}
function gg_sum_embd_bge_small {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'BGE Small (BERT):\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
}
## main
export LLAMA_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1
if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
rm -rf ${SRC}/models-mnt
@ -754,6 +829,7 @@ test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small
test $ret -eq 0 && gg_run rerank_tiny
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
@ -761,7 +837,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
fi
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
test $ret -eq 0 && gg_run pythia_1_4b
else
test $ret -eq 0 && gg_run pythia_2_8b

View file

@ -51,21 +51,23 @@ endif()
set(TARGET common)
add_library(${TARGET} STATIC
arg.cpp
arg.h
base64.hpp
common.h
common.cpp
sampling.h
sampling.cpp
console.h
common.h
console.cpp
grammar-parser.h
grammar-parser.cpp
json.hpp
console.h
json-schema-to-grammar.cpp
train.h
train.cpp
ngram-cache.h
json.hpp
log.cpp
log.h
ngram-cache.cpp
ngram-cache.h
sampling.cpp
sampling.h
train.cpp
train.h
)
if (BUILD_SHARED_LIBS)

2020
common/arg.cpp Normal file

File diff suppressed because it is too large Load diff

77
common/arg.h Normal file
View file

@ -0,0 +1,77 @@
#pragma once
#include "common.h"
#include <set>
#include <string>
#include <vector>
//
// CLI argument parsing
//
struct llama_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value
const char * env = nullptr;
std::string help;
bool is_sparam = false; // is current arg a sampling param?
void (*handler_void) (gpt_params & params) = nullptr;
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
void (*handler_int) (gpt_params & params, int) = nullptr;
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &)
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, int)
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const std::string & help,
void (*handler)(gpt_params & params)
) : args(args), help(help), handler_void(handler) {}
// support 2 values for arg
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const char * value_hint_2,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
llama_arg & set_env(const char * env);
llama_arg & set_sparam();
bool in_example(enum llama_example ex);
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();
};
struct gpt_params_context {
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
gpt_params & params;
std::vector<llama_arg> options;
void(*print_usage)(int, char **) = nullptr;
gpt_params_context(gpt_params & params) : params(params) {}
};
// parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
// function to be used by test-arg-parser
gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

File diff suppressed because it is too large Load diff

View file

@ -4,18 +4,9 @@
#include "llama.h"
#include "sampling.h"
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
#include <cmath>
#include <string>
#include <vector>
#include <random>
#include <thread>
#include <unordered_map>
#include <tuple>
#include <sstream>
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
@ -54,26 +45,103 @@ struct llama_control_vector_load_info;
// CPU utils
//
struct cpu_params {
int n_threads = -1;
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
bool mask_valid = false; // Default: any CPU
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
bool strict_cpu = false; // Use strict CPU placement
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};
int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
//
// CLI argument parsing
// Common params
//
enum llama_example {
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN,
LLAMA_EXAMPLE_INFILL,
LLAMA_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL,
LLAMA_EXAMPLE_PASSKEY,
LLAMA_EXAMPLE_IMATRIX,
LLAMA_EXAMPLE_BENCH,
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_COUNT,
};
enum gpt_sampler_type {
GPT_SAMPLER_TYPE_NONE = 0,
GPT_SAMPLER_TYPE_TOP_K = 1,
GPT_SAMPLER_TYPE_TOP_P = 2,
GPT_SAMPLER_TYPE_MIN_P = 3,
GPT_SAMPLER_TYPE_TFS_Z = 4,
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
};
// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
DIMRE_METHOD_PCA,
DIMRE_METHOD_MEAN,
};
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
// sampler parameters
struct gpt_sampler_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typ_p = 1.00f; // typical_p, 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false;
bool no_perf = false; // disable performance metrics
std::vector<enum gpt_sampler_type> samplers = {
GPT_SAMPLER_TYPE_TOP_K,
GPT_SAMPLER_TYPE_TFS_Z,
GPT_SAMPLER_TYPE_TYPICAL_P,
GPT_SAMPLER_TYPE_TOP_P,
GPT_SAMPLER_TYPE_MIN_P,
GPT_SAMPLER_TYPE_TEMPERATURE
};
std::string grammar; // optional BNF-like grammar to constrain sampling
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
// print the parameters into a string
std::string print() const;
};
struct gpt_params {
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@ -100,6 +168,11 @@ struct gpt_params {
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
struct cpu_params draft_cpuparams;
struct cpu_params draft_cpuparams_batch;
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
@ -110,26 +183,25 @@ struct gpt_params {
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
// // sampling parameters
struct llama_sampling_params sparams;
struct gpt_sampler_params sparams;
std::string model = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
std::string hf_token = ""; // HF token
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string prompt = "";
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::string logdir = ""; // directory in which to save YAML log files
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
std::string logits_file = ""; // file for saving *all* logits
std::string rpc_servers = ""; // comma separated list of RPC servers
std::string model = ""; // model path // NOLINT
std::string model_draft = ""; // draft model for speculative decoding // NOLINT
std::string model_alias = "unknown"; // model alias // NOLINT
std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string prompt = ""; // NOLINT
std::string prompt_file = ""; // store the external prompt file name // NOLINT
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
std::string logdir = ""; // directory in which to save YAML log files // NOLINT
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@ -173,15 +245,15 @@ struct gpt_params {
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on inifinite text generation
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
@ -191,7 +263,7 @@ struct gpt_params {
std::string cache_type_v = "f16"; // KV cache data type for the V
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::string mmproj = ""; // path to multimodal projector // NOLINT
std::vector<std::string> image; // path to image file(s)
// embedding
@ -199,25 +271,29 @@ struct gpt_params {
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
std::string embd_sep = "\n"; // separator of embendings
bool reranking = false; // enable reranking support on server
// server params
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
std::string hostname = "127.0.0.1";
std::string public_path = "";
std::string chat_template = "";
std::string system_prompt = "";
std::string public_path = ""; // NOLINT
std::string chat_template = ""; // NOLINT
std::string system_prompt = ""; // NOLINT
bool enable_chat_template = true;
std::vector<std::string> api_keys;
std::string ssl_file_key = "";
std::string ssl_file_cert = "";
std::string ssl_file_key = ""; // NOLINT
std::string ssl_file_cert = ""; // NOLINT
bool endpoint_slots = true;
// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = false;
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;
bool log_json = false;
@ -265,18 +341,22 @@ struct gpt_params {
bool spm_infill = false; // suffix/prefix/middle pattern for infill
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
// batched-bench params
bool batched_bench_output_jsonl = false;
};
void gpt_params_handle_hf_token(gpt_params & params);
void gpt_params_handle_model_default(gpt_params & params);
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
// call once at the start of a program if it uses libcommon
// initializes the logging system and prints info about the build
void gpt_init();
std::string gpt_params_get_system_info(const gpt_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio);
//
// String utils
//
@ -286,6 +366,8 @@ std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
std::vector<T> values;
@ -303,6 +385,11 @@ static std::vector<T> string_split(const std::string & str, char delim) {
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
std::string string_from(bool value);
std::string string_from(const std::vector<int> & values);
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
//
// Filesystem utils
//
@ -326,7 +413,8 @@ struct llama_init_result {
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
@ -378,10 +466,6 @@ std::string llama_detokenize(
const std::vector<llama_token> & tokens,
bool special = true);
// Uses the value from the model metadata if possible, otherwise
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);
//
// Chat template utils
//

View file

@ -94,6 +94,9 @@ namespace console {
simple_io = true;
}
}
if (simple_io) {
_setmode(_fileno(stdin), _O_U8TEXT);
}
#else
// POSIX-specific console initialization
if (!simple_io) {

View file

@ -1,536 +0,0 @@
#include "grammar-parser.h"
#include <cstdint>
#include <cwchar>
#include <string>
#include <utility>
#include <stdexcept>
#include <exception>
namespace grammar_parser {
// NOTE: assumes valid utf8 (but checks for overrun)
// copied from llama.cpp
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t first_byte = static_cast<uint8_t>(*src);
uint8_t highbits = first_byte >> 4;
int len = lookup[highbits];
uint8_t mask = (1 << (8 - len)) - 1;
uint32_t value = first_byte & mask;
const char * end = src + len; // may overrun!
const char * pos = src + 1;
for ( ; pos < end && *pos; pos++) {
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
}
return std::make_pair(value, pos);
}
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
return result.first->second;
}
static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
return next_id;
}
static void add_rule(
parse_state & state,
uint32_t rule_id,
const std::vector<llama_grammar_element> & rule) {
if (state.rules.size() <= rule_id) {
state.rules.resize(rule_id + 1);
}
state.rules[rule_id] = rule;
}
static bool is_digit_char(char c) {
return '0' <= c && c <= '9';
}
static bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
}
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
const char * pos = src;
const char * end = src + size;
uint32_t value = 0;
for ( ; pos < end && *pos; pos++) {
value <<= 4;
char c = *pos;
if ('a' <= c && c <= 'f') {
value += c - 'a' + 10;
} else if ('A' <= c && c <= 'F') {
value += c - 'A' + 10;
} else if ('0' <= c && c <= '9') {
value += c - '0';
} else {
break;
}
}
if (pos != end) {
throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
}
return std::make_pair(value, pos);
}
static const char * parse_space(const char * src, bool newline_ok) {
const char * pos = src;
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
if (*pos == '#') {
while (*pos && *pos != '\r' && *pos != '\n') {
pos++;
}
} else {
pos++;
}
}
return pos;
}
static const char * parse_name(const char * src) {
const char * pos = src;
while (is_word_char(*pos)) {
pos++;
}
if (pos == src) {
throw std::runtime_error(std::string("expecting name at ") + src);
}
return pos;
}
static const char * parse_int(const char * src) {
const char * pos = src;
while (is_digit_char(*pos)) {
pos++;
}
if (pos == src) {
throw std::runtime_error(std::string("expecting integer at ") + src);
}
return pos;
}
static std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
case 'x': return parse_hex(src + 2, 2);
case 'u': return parse_hex(src + 2, 4);
case 'U': return parse_hex(src + 2, 8);
case 't': return std::make_pair('\t', src + 2);
case 'r': return std::make_pair('\r', src + 2);
case 'n': return std::make_pair('\n', src + 2);
case '\\':
case '"':
case '[':
case ']':
return std::make_pair(src[1], src + 2);
default:
throw std::runtime_error(std::string("unknown escape at ") + src);
}
} else if (*src) {
return decode_utf8(src);
}
throw std::runtime_error("unexpected end of input");
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested);
static const char * parse_sequence(
parse_state & state,
const char * src,
const std::string & rule_name,
std::vector<llama_grammar_element> & out_elements,
bool is_nested) {
size_t last_sym_start = out_elements.size();
const char * pos = src;
auto handle_repetitions = [&](int min_times, int max_times) {
if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
}
// apply transformation to previous symbol (last_sym_start to end) according to
// the following rewrite rules:
// S{m,n} --> S S S (m times) S'(n-m)
// S'(x) ::= S S'(x-1) |
// (... n-m definitions of these S' rules ...)
// S'(1) ::= S |
// S{m,} --> S S S (m times) S'
// S' ::= S S' |
// S* --> S{0,}
// --> S' ::= S S' |
// S+ --> S{1,}
// --> S S'
// S' ::= S S' |
// S? --> S{0,1}
// --> S'
// S' ::= S |
std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
if (min_times == 0) {
out_elements.resize(last_sym_start);
} else {
// Repeat the previous elements (min_times - 1) times
for (int i = 1; i < min_times; i++) {
out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
}
}
uint32_t last_rec_rule_id = 0;
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
std::vector<llama_grammar_element> rec_rule(previous_elements);
for (int i = 0; i < n_opt; i++) {
rec_rule.resize(previous_elements.size());
uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
if (i > 0 || max_times < 0) {
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
}
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, rec_rule_id, rec_rule);
last_rec_rule_id = rec_rule_id;
}
if (n_opt > 0) {
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
}
};
while (*pos) {
if (*pos == '"') { // literal string
pos++;
last_sym_start = out_elements.size();
while (*pos != '"') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '[') { // char range(s)
pos++;
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
if (*pos == '^') {
pos++;
start_type = LLAMA_GRETYPE_CHAR_NOT;
}
last_sym_start = out_elements.size();
while (*pos != ']') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
enum llama_gretype type = last_sym_start < out_elements.size()
? LLAMA_GRETYPE_CHAR_ALT
: start_type;
out_elements.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') {
if (!pos[1]) {
throw std::runtime_error("unexpected end of input");
}
auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
}
}
pos = parse_space(pos + 1, is_nested);
} else if (is_word_char(*pos)) { // rule reference
const char * name_end = parse_name(pos);
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
pos = parse_space(name_end, is_nested);
last_sym_start = out_elements.size();
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
} else if (*pos == '(') { // grouping
// parse nested alternates into synthesized rule
pos = parse_space(pos + 1, true);
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
last_sym_start = out_elements.size();
// output reference to synthesized rule
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
if (*pos != ')') {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '.') { // any char
last_sym_start = out_elements.size();
out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, -1);
} else if (*pos == '+') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(1, -1);
} else if (*pos == '?') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, 1);
} else if (*pos == '{') {
pos = parse_space(pos + 1, is_nested);
if (!is_digit_char(*pos)) {
throw std::runtime_error(std::string("expecting an int at ") + pos);
}
const char * int_end = parse_int(pos);
int min_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
int max_times = -1;
if (*pos == '}') {
max_times = min_times;
pos = parse_space(pos + 1, is_nested);
} else if (*pos == ',') {
pos = parse_space(pos + 1, is_nested);
if (is_digit_char(*pos)) {
const char * int_end = parse_int(pos);
max_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
}
if (*pos != '}') {
throw std::runtime_error(std::string("expecting '}' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else {
throw std::runtime_error(std::string("expecting ',' at ") + pos);
}
handle_repetitions(min_times, max_times);
} else {
break;
}
}
return pos;
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested) {
std::vector<llama_grammar_element> rule;
const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
while (*pos == '|') {
rule.push_back({LLAMA_GRETYPE_ALT, 0});
pos = parse_space(pos + 1, true);
pos = parse_sequence(state, pos, rule_name, rule, is_nested);
}
rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, rule_id, rule);
return pos;
}
static const char * parse_rule(parse_state & state, const char * src) {
const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src;
uint32_t rule_id = get_symbol_id(state, src, name_len);
const std::string name(src, name_len);
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
throw std::runtime_error(std::string("expecting ::= at ") + pos);
}
pos = parse_space(pos + 3, true);
pos = parse_alternates(state, pos, name, rule_id, false);
if (*pos == '\r') {
pos += pos[1] == '\n' ? 2 : 1;
} else if (*pos == '\n') {
pos++;
} else if (*pos) {
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
}
return parse_space(pos, true);
}
parse_state parse(const char * src) {
try {
parse_state state;
const char * pos = parse_space(src, true);
while (*pos) {
pos = parse_rule(state, pos);
}
// Validate the state to ensure that all rules are defined
for (const auto & rule : state.rules) {
for (const auto & elem : rule) {
if (elem.type == LLAMA_GRETYPE_RULE_REF) {
// Ensure that the rule at that location exists
if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
// Get the name of the rule that is missing
for (const auto & kv : state.symbol_ids) {
if (kv.second == elem.value) {
throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
}
}
}
}
}
}
return state;
} catch (const std::exception & err) {
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
return parse_state();
}
}
static void print_grammar_char(FILE * file, uint32_t c) {
if (0x20 <= c && c <= 0x7f) {
fprintf(file, "%c", static_cast<char>(c));
} else {
// cop out of encoding UTF-8
fprintf(file, "<U+%04X>", c);
}
}
static bool is_char_element(llama_grammar_element elem) {
switch (elem.type) {
case LLAMA_GRETYPE_CHAR: return true;
case LLAMA_GRETYPE_CHAR_NOT: return true;
case LLAMA_GRETYPE_CHAR_ALT: return true;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
case LLAMA_GRETYPE_CHAR_ANY: return true;
default: return false;
}
}
static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
for (auto elem : rule) {
switch (elem.type) {
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
}
switch (elem.type) {
case LLAMA_GRETYPE_END:
case LLAMA_GRETYPE_ALT:
case LLAMA_GRETYPE_RULE_REF:
fprintf(file, "(%u) ", elem.value);
break;
case LLAMA_GRETYPE_CHAR:
case LLAMA_GRETYPE_CHAR_NOT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_ANY:
fprintf(file, "(\"");
print_grammar_char(file, elem.value);
fprintf(file, "\") ");
break;
}
}
fprintf(file, "\n");
}
static void print_rule(
FILE * file,
uint32_t rule_id,
const std::vector<llama_grammar_element> & rule,
const std::map<uint32_t, std::string> & symbol_id_names) {
if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
throw std::runtime_error(
"malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
}
fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
llama_grammar_element elem = rule[i];
switch (elem.type) {
case LLAMA_GRETYPE_END:
throw std::runtime_error(
"unexpected end of rule: " + std::to_string(rule_id) + "," +
std::to_string(i));
case LLAMA_GRETYPE_ALT:
fprintf(file, "| ");
break;
case LLAMA_GRETYPE_RULE_REF:
fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
break;
case LLAMA_GRETYPE_CHAR:
fprintf(file, "[");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_NOT:
fprintf(file, "[^");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
fprintf(file, "-");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_ALT:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_ANY:
fprintf(file, ".");
break;
}
if (is_char_element(elem)) {
switch (rule[i + 1].type) {
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ANY:
break;
default:
fprintf(file, "] ");
}
}
}
fprintf(file, "\n");
}
void print_grammar(FILE * file, const parse_state & state) {
try {
std::map<uint32_t, std::string> symbol_id_names;
for (const auto & kv : state.symbol_ids) {
symbol_id_names[kv.second] = kv.first;
}
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
// fprintf(file, "%zu: ", i);
// print_rule_binary(file, state.rules[i]);
print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
// fprintf(file, "\n");
}
} catch (const std::exception & err) {
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
}
}
std::vector<const llama_grammar_element *> parse_state::c_rules() {
std::vector<const llama_grammar_element *> ret;
ret.reserve(rules.size());
for (const auto & rule : rules) {
ret.push_back(rule.data());
}
return ret;
}
}

View file

@ -1,29 +0,0 @@
// Implements a parser for an extended Backus-Naur form (BNF), producing the
// binary context-free grammar format specified by llama.h. Supports character
// ranges, grouping, and repetition operators. As an example, a grammar for
// arithmetic might look like:
//
// root ::= expr
// expr ::= term ([-+*/] term)*
// term ::= num | "(" space expr ")" space
// num ::= [0-9]+ space
// space ::= [ \t\n]*
#pragma once
#include "llama.h"
#include <vector>
#include <map>
#include <cstdint>
#include <string>
namespace grammar_parser {
struct parse_state {
std::map<std::string, uint32_t> symbol_ids;
std::vector<std::vector<llama_grammar_element>> rules;
std::vector<const llama_grammar_element *> c_rules();
};
parse_state parse(const char * src);
void print_grammar(FILE * file, const parse_state & state);
}

401
common/log.cpp Normal file
View file

@ -0,0 +1,401 @@
#include "log.h"
#include <condition_variable>
#include <cstdarg>
#include <cstdio>
#include <mutex>
#include <sstream>
#include <thread>
#include <vector>
int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
void gpt_log_set_verbosity_thold(int verbosity) {
gpt_log_verbosity_thold = verbosity;
}
#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD "\033[1m"
#define LOG_COL_RED "\033[31m"
#define LOG_COL_GREEN "\033[32m"
#define LOG_COL_YELLOW "\033[33m"
#define LOG_COL_BLUE "\033[34m"
#define LOG_COL_MAGENTA "\033[35m"
#define LOG_COL_CYAN "\033[36m"
#define LOG_COL_WHITE "\033[37m"
static int64_t t_us() {
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}
// colors
enum gpt_log_col : int {
GPT_LOG_COL_DEFAULT = 0,
GPT_LOG_COL_BOLD,
GPT_LOG_COL_RED,
GPT_LOG_COL_GREEN,
GPT_LOG_COL_YELLOW,
GPT_LOG_COL_BLUE,
GPT_LOG_COL_MAGENTA,
GPT_LOG_COL_CYAN,
GPT_LOG_COL_WHITE,
};
// disable colors by default
static std::vector<const char *> g_col = {
"",
"",
"",
"",
"",
"",
"",
"",
"",
};
struct gpt_log_entry {
enum ggml_log_level level;
bool prefix;
int64_t timestamp;
std::vector<char> msg;
// signals the worker thread to stop
bool is_end;
void print(FILE * file = nullptr) const {
FILE * fcur = file;
if (!fcur) {
// stderr displays DBG messages only when their verbosity level is not higher than the threshold
// these messages will still be logged to a file
if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
return;
}
fcur = stdout;
if (level != GGML_LOG_LEVEL_NONE) {
fcur = stderr;
}
}
if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
if (timestamp) {
// [M.s.ms.us]
fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
g_col[GPT_LOG_COL_BLUE],
(int) (timestamp / 1000000 / 60),
(int) (timestamp / 1000000 % 60),
(int) (timestamp / 1000 % 1000),
(int) (timestamp % 1000),
g_col[GPT_LOG_COL_DEFAULT]);
}
switch (level) {
case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
default:
break;
}
}
fprintf(fcur, "%s", msg.data());
if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
}
fflush(fcur);
}
};
struct gpt_log {
// default capacity - will be expanded if needed
gpt_log() : gpt_log(256) {}
gpt_log(size_t capacity) {
file = nullptr;
prefix = false;
timestamps = false;
running = false;
t_start = t_us();
// initial message size - will be expanded if longer messages arrive
entries.resize(capacity);
for (auto & entry : entries) {
entry.msg.resize(256);
}
head = 0;
tail = 0;
resume();
}
~gpt_log() {
pause();
if (file) {
fclose(file);
}
}
private:
std::mutex mtx;
std::thread thrd;
std::condition_variable cv;
FILE * file;
bool prefix;
bool timestamps;
bool running;
int64_t t_start;
// ring buffer of entries
std::vector<gpt_log_entry> entries;
size_t head;
size_t tail;
// worker thread copies into this
gpt_log_entry cur;
public:
void add(enum ggml_log_level level, const char * fmt, va_list args) {
std::lock_guard<std::mutex> lock(mtx);
if (!running) {
// discard messages while the worker thread is paused
return;
}
auto & entry = entries[tail];
{
// cannot use args twice, so make a copy in case we need to expand the buffer
va_list args_copy;
va_copy(args_copy, args);
#if 1
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
if (n >= entry.msg.size()) {
entry.msg.resize(n + 1);
vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
}
#else
// hack for bolding arguments
std::stringstream ss;
for (int i = 0; fmt[i] != 0; i++) {
if (fmt[i] == '%') {
ss << LOG_COL_BOLD;
while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
ss << LOG_COL_DEFAULT;
if (fmt[i] == 0) break;
}
ss << fmt[i];
}
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
if (n >= entry.msg.size()) {
entry.msg.resize(n + 1);
vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
}
#endif
}
entry.level = level;
entry.prefix = prefix;
entry.timestamp = 0;
if (timestamps) {
entry.timestamp = t_us() - t_start;
}
entry.is_end = false;
tail = (tail + 1) % entries.size();
if (tail == head) {
// expand the buffer
std::vector<gpt_log_entry> new_entries(2*entries.size());
size_t new_tail = 0;
do {
new_entries[new_tail] = std::move(entries[head]);
head = (head + 1) % entries.size();
new_tail = (new_tail + 1);
} while (head != tail);
head = 0;
tail = new_tail;
for (size_t i = tail; i < new_entries.size(); i++) {
new_entries[i].msg.resize(256);
}
entries = std::move(new_entries);
}
cv.notify_one();
}
void resume() {
std::lock_guard<std::mutex> lock(mtx);
if (running) {
return;
}
running = true;
thrd = std::thread([this]() {
while (true) {
{
std::unique_lock<std::mutex> lock(mtx);
cv.wait(lock, [this]() { return head != tail; });
cur = entries[head];
head = (head + 1) % entries.size();
}
if (cur.is_end) {
break;
}
cur.print(); // stdout and stderr
if (file) {
cur.print(file);
}
}
});
}
void pause() {
{
std::lock_guard<std::mutex> lock(mtx);
if (!running) {
return;
}
running = false;
// push an entry to signal the worker thread to stop
{
auto & entry = entries[tail];
entry.is_end = true;
tail = (tail + 1) % entries.size();
}
cv.notify_one();
}
thrd.join();
}
void set_file(const char * path) {
pause();
if (file) {
fclose(file);
}
if (path) {
file = fopen(path, "w");
} else {
file = nullptr;
}
resume();
}
void set_colors(bool colors) {
pause();
if (colors) {
g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
} else {
for (size_t i = 0; i < g_col.size(); i++) {
g_col[i] = "";
}
}
resume();
}
void set_prefix(bool prefix) {
std::lock_guard<std::mutex> lock(mtx);
this->prefix = prefix;
}
void set_timestamps(bool timestamps) {
std::lock_guard<std::mutex> lock(mtx);
this->timestamps = timestamps;
}
};
//
// public API
//
struct gpt_log * gpt_log_init() {
return new gpt_log;
}
struct gpt_log * gpt_log_main() {
static struct gpt_log log;
return &log;
}
void gpt_log_pause(struct gpt_log * log) {
log->pause();
}
void gpt_log_resume(struct gpt_log * log) {
log->resume();
}
void gpt_log_free(struct gpt_log * log) {
delete log;
}
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
va_list args;
va_start(args, fmt);
log->add(level, fmt, args);
va_end(args);
}
void gpt_log_set_file(struct gpt_log * log, const char * file) {
log->set_file(file);
}
void gpt_log_set_colors(struct gpt_log * log, bool colors) {
log->set_colors(colors);
}
void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
log->set_prefix(prefix);
}
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
log->set_timestamps(timestamps);
}

View file

@ -1,724 +1,92 @@
#pragma once
#include <chrono>
#include <cstring>
#include <sstream>
#include <iostream>
#include <thread>
#include <vector>
#include <algorithm>
#include <cinttypes>
#include "ggml.h" // for ggml_log_level
// --------------------------------
//
// Basic usage:
//
// --------
//
// The LOG() and LOG_TEE() macros are ready to go by default
// they do not require any initialization.
//
// LOGLN() and LOG_TEELN() are variants which automatically
// include \n character at the end of the log string.
//
// LOG() behaves exactly like printf, by default writing to a logfile.
// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
//
// Default logfile is named
// "llama.<threadID>.log"
// Default LOG_TEE() secondary output target is
// stderr
//
// Logs can be dynamically disabled or enabled using functions:
// log_disable()
// and
// log_enable()
//
// A log target can be changed with:
// log_set_target( string )
// creating and opening, or re-opening a file by string filename
// or
// log_set_target( FILE* )
// allowing to point at stderr, stdout, or any valid FILE* file handler.
//
// --------
//
// End of Basic usage.
//
// --------------------------------
// Specifies a log target.
// default uses log_handler() with "llama.log" log file
// this can be changed, by defining LOG_TARGET
// like so:
//
// #define LOG_TARGET (a valid FILE*)
// #include "log.h"
//
// or it can be simply redirected to stdout or stderr
// like so:
//
// #define LOG_TARGET stderr
// #include "log.h"
//
// The log target can also be redirected to a different function
// like so:
//
// #define LOG_TARGET log_handler_different()
// #include "log.h"
//
// FILE* log_handler_different()
// {
// return stderr;
// }
//
// or:
//
// #define LOG_TARGET log_handler_another_one("somelog.log")
// #include "log.h"
//
// FILE* log_handler_another_one(char*filename)
// {
// static FILE* logfile = nullptr;
// (...)
// if( !logfile )
// {
// fopen(...)
// }
// (...)
// return logfile
// }
//
#ifndef LOG_TARGET
#define LOG_TARGET log_handler()
#endif
#ifndef LOG_TEE_TARGET
#define LOG_TEE_TARGET stderr
#endif
// Utility for synchronizing log configuration state
// since std::optional was introduced only in c++17
enum LogTriState
{
LogTriStateSame,
LogTriStateFalse,
LogTriStateTrue
};
// Utility to obtain "pid" like unique process id and use it when creating log files.
inline std::string log_get_pid()
{
static std::string pid;
if (pid.empty())
{
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
// it's not the same as "pid" but is unique enough to solve multiple instances
// trying to write to the same log.
std::stringstream ss;
ss << std::this_thread::get_id();
pid = ss.str();
}
return pid;
}
// Utility function for generating log file names with unique id based on thread id.
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
// where the number is a runtime id of the current thread.
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
// INTERNAL, DO NOT USE
inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
{
static bool _multilog = false;
if (multilog != LogTriStateSame)
{
_multilog = multilog == LogTriStateTrue;
}
std::stringstream buf;
buf << log_file_basename;
if (_multilog)
{
buf << ".";
buf << log_get_pid();
}
buf << ".";
buf << log_file_extension;
return buf.str();
}
#ifndef LOG_DEFAULT_FILE_NAME
#define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
#endif
// Utility for turning #define values into string literals
// so we can have a define for stderr and
// we can print "stderr" instead of literal stderr, etc.
#define LOG_STRINGIZE1(s) #s
#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
// Allows disabling timestamps.
// in order to disable, define LOG_NO_TIMESTAMPS
// like so:
//
// #define LOG_NO_TIMESTAMPS
// #include "log.h"
//
#ifndef LOG_NO_TIMESTAMPS
#ifndef _MSC_VER
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
#ifndef __GNUC__
# define LOG_ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__)
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#define LOG_TIMESTAMP_FMT "%s"
#define LOG_TIMESTAMP_VAL ,""
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#ifdef LOG_TEE_TIMESTAMPS
#ifndef _MSC_VER
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
#else
#define LOG_TEE_TIMESTAMP_FMT "%s"
#define LOG_TEE_TIMESTAMP_VAL ,""
#endif
#define LOG_DEFAULT_DEBUG 1
#define LOG_DEFAULT_LLAMA 0
// Allows disabling file/line/function prefix
// in order to disable, define LOG_NO_FILE_LINE_FUNCTION
// like so:
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
// set via gpt_log_set_verbosity()
extern int gpt_log_verbosity_thold;
void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
// the gpt_log uses an internal worker thread to print/write log messages
// when the worker thread is paused, incoming log messages are discarded
struct gpt_log;
struct gpt_log * gpt_log_init();
struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
void gpt_log_free (struct gpt_log * log);
LOG_ATTRIBUTE_FORMAT(3, 4)
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
// defaults: file = NULL, colors = false, prefix = false, timestamps = false
//
// #define LOG_NO_FILE_LINE_FUNCTION
// #include "log.h"
// regular log output:
//
#ifndef LOG_NO_FILE_LINE_FUNCTION
#ifndef _MSC_VER
#define LOG_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif
#else
#define LOG_FLF_FMT "%s"
#define LOG_FLF_VAL ,""
#endif
#ifdef LOG_TEE_FILE_LINE_FUNCTION
#ifndef _MSC_VER
#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif
#else
#define LOG_TEE_FLF_FMT "%s"
#define LOG_TEE_FLF_VAL ,""
#endif
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
// ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
// llm_load_tensors: ggml ctx size = 0.27 MiB
// llm_load_tensors: offloading 32 repeating layers to GPU
// llm_load_tensors: offloading non-repeating layers to GPU
//
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
#define LOG_IMPL(str, ...) \
// with prefix = true, timestamps = true, the log output will look like this:
//
// 0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
// 0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB
// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
//
// I - info (stdout, V = 0)
// W - warning (stderr, V = 0)
// E - error (stderr, V = 0)
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
//
void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe
void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe
void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix
// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
//
// for example:
//
// LOG_DBG("this is a debug message: %d\n", expensive_function());
//
// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
//
#define LOG_TMPL(level, verbosity, ...) \
do { \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TARGET); \
if ((verbosity) <= gpt_log_verbosity_thold) { \
gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
} \
} while (0)
#else
#define LOG_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
} while (0)
#endif
// INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD
//
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
#define LOG_TEE_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
} while (0)
#else
#define LOG_TEE_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
} while (0)
#endif
#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
// The '\0' as a last argument, is a trick to bypass the silly
// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
// so we can have a single macro which can be called just like printf.
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__)
// Main LOG macro.
// behaves like printf, and supports arguments the exact same way.
//
#if !defined(_MSC_VER) || defined(__clang__)
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif
// Main TEE macro.
// does the same as LOG
// and
// simultaneously writes stderr.
//
// Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET
//
#if !defined(_MSC_VER) || defined(__clang__)
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
#if !defined(_MSC_VER) || defined(__clang__)
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
#endif
// INTERNAL, DO NOT USE
inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
{
static bool _initialized = false;
static bool _append = false;
static bool _disabled = filename.empty() && target == nullptr;
static std::string log_current_filename{filename};
static FILE *log_current_target{target};
static FILE *logfile = nullptr;
if (change)
{
if (append != LogTriStateSame)
{
_append = append == LogTriStateTrue;
return logfile;
}
if (disable == LogTriStateTrue)
{
// Disable primary target
_disabled = true;
}
// If previously disabled, only enable, and keep previous target
else if (disable == LogTriStateFalse)
{
_disabled = false;
}
// Otherwise, process the arguments
else if (log_current_filename != filename || log_current_target != target)
{
_initialized = false;
}
}
if (_disabled)
{
// Log is disabled
return nullptr;
}
if (_initialized)
{
// with fallback in case something went wrong
return logfile ? logfile : stderr;
}
// do the (re)initialization
if (target != nullptr)
{
if (logfile != nullptr && logfile != stdout && logfile != stderr)
{
fclose(logfile);
}
log_current_filename = LOG_DEFAULT_FILE_NAME;
log_current_target = target;
logfile = target;
}
else
{
if (log_current_filename != filename)
{
if (logfile != nullptr && logfile != stdout && logfile != stderr)
{
fclose(logfile);
}
}
logfile = fopen(filename.c_str(), _append ? "a" : "w");
}
if (!logfile)
{
// Verify whether the file was opened, otherwise fallback to stderr
logfile = stderr;
fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
fflush(stderr);
// At this point we let the init flag be to true below, and let the target fallback to stderr
// otherwise we would repeatedly fopen() which was already unsuccessful
}
_initialized = true;
return logfile ? logfile : stderr;
}
// INTERNAL, DO NOT USE
inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
{
return log_handler1_impl(change, append, disable, filename, target);
}
// Disables logs entirely at runtime.
// Makes LOG() and LOG_TEE() produce no output,
// until enabled back.
#define log_disable() log_disable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_disable_impl()
{
return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
}
// Enables logs at runtime.
#define log_enable() log_enable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_enable_impl()
{
return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
}
// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
#define log_set_target(target) log_set_target_impl(target)
// INTERNAL, DO NOT USE
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
// INTERNAL, DO NOT USE
inline FILE *log_handler() { return log_handler1_impl(); }
// Enable or disable creating separate log files for each run.
// can ONLY be invoked BEFORE first log use.
#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
// Enable or disable append mode for log file.
// can ONLY be invoked BEFORE first log use.
#define log_append(enable) log_append_impl(enable)
// INTERNAL, DO NOT USE
inline FILE *log_append_impl(bool enable)
{
return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
}
inline void log_test()
{
log_disable();
LOG("01 Hello World to nobody, because logs are disabled!\n");
log_enable();
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
log_set_target(stderr);
LOG("04 Hello World to stderr!\n");
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("06 Hello World to default log file!\n");
log_set_target(stdout);
LOG("07 Hello World to stdout!\n");
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("08 Hello World to default log file again!\n");
log_disable();
LOG("09 Hello World _1_ into the void!\n");
log_enable();
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
log_disable();
log_set_target("llama.anotherlog.log");
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
log_enable();
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
log_set_target("llama.yetanotherlog.log");
LOG("13 Hello World this time in yet new file?\n");
log_set_target(log_filename_generator("llama_autonamed", "log"));
LOG("14 Hello World in log with generated filename!\n");
#ifdef _MSC_VER
LOG_TEE("15 Hello msvc TEE without arguments\n");
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
LOG_TEELN("17 Hello msvc TEELN without arguments\n");
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
LOG("19 Hello msvc LOG without arguments\n");
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
LOGLN("21 Hello msvc LOGLN without arguments\n");
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
#endif
}
inline bool log_param_single_parse(const std::string & param)
{
if ( param == "--log-test")
{
log_test();
return true;
}
if ( param == "--log-disable")
{
log_disable();
return true;
}
if ( param == "--log-enable")
{
log_enable();
return true;
}
if (param == "--log-new")
{
log_multilog(true);
return true;
}
if (param == "--log-append")
{
log_append(true);
return true;
}
return false;
}
inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
{
if ( param == "--log-file")
{
if (!check_but_dont_parse)
{
log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
}
return true;
}
return false;
}
inline void log_print_usage()
{
printf("log options:\n");
/* format
printf(" -h, --help show this help message and exit\n");*/
/* spacing
printf("__-param----------------Description\n");*/
printf(" --log-test Run simple logging test\n");
printf(" --log-disable Disable trace logs\n");
printf(" --log-enable Enable trace logs\n");
printf(" --log-file Specify a log filename (without extension)\n");
printf(" --log-new Create a separate new log file on start. "
"Each log file will have unique name: \"<name>.<ID>.log\"\n");
printf(" --log-append Don't truncate the old log file.\n");
printf("\n");
}
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
// INTERNAL, DO NOT USE
inline void log_dump_cmdline_impl(int argc, char **argv)
{
std::stringstream buf;
for (int i = 0; i < argc; ++i)
{
if (std::string(argv[i]).find(' ') != std::string::npos)
{
buf << " \"" << argv[i] <<"\"";
}
else
{
buf << " " << argv[i];
}
}
LOGLN("Cmd:%s", buf.str().c_str());
}
#define log_tostr(var) log_var_to_string_impl(var).c_str()
inline std::string log_var_to_string_impl(bool var)
{
return var ? "true" : "false";
}
inline std::string log_var_to_string_impl(std::string var)
{
return var;
}
inline std::string log_var_to_string_impl(const std::vector<int> & var)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (auto e : var)
{
if (first)
{
first = false;
}
else
{
buf << ", ";
}
buf << std::to_string(e);
}
buf << " ]";
return buf.str();
}
template <typename C, typename T>
inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (const auto & token : tokens)
{
if (!first) {
buf << ", ";
} else {
first = false;
}
auto detokenized = llama_token_to_piece(ctx, token);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf
<< "'" << detokenized << "'"
<< ":" << std::to_string(token);
}
buf << " ]";
return buf.str();
}
template <typename C, typename B>
inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (int i = 0; i < batch.n_tokens; ++i)
{
if (!first) {
buf << ", ";
} else {
first = false;
}
auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf
<< "\n" << std::to_string(i)
<< ":token '" << detokenized << "'"
<< ":pos " << std::to_string(batch.pos[i])
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
<< ":logits " << std::to_string(batch.logits[i]);
}
buf << " ]";
return buf.str();
}
#ifdef LOG_DISABLE_LOGS
#undef LOG
#define LOG(...) // dummy stub
#undef LOGLN
#define LOGLN(...) // dummy stub
#undef LOG_TEE
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
#undef LOG_TEELN
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
#undef LOG_DISABLE
#define LOG_DISABLE() // dummy stub
#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub
#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub
#undef LOG_SET_TARGET
#define LOG_SET_TARGET(...) // dummy stub
#undef LOG_DUMP_CMDLINE
#define LOG_DUMP_CMDLINE(...) // dummy stub
#endif // LOG_DISABLE_LOGS
#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT, verbosity, __VA_ARGS__)

View file

@ -2,8 +2,11 @@
#include "common.h"
#include "log.h"
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <thread>
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) {

View file

@ -1,395 +1,119 @@
#define LLAMA_API_INTERNAL
#include "sampling.h"
#include <random>
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
struct llama_sampling_context * result = new llama_sampling_context();
#include "common.h"
result->params = params;
result->grammar = nullptr;
#include <cmath>
#include <unordered_map>
// if there is a grammar, parse it
if (!params.grammar.empty()) {
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h
template<typename T>
struct ring_buffer {
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
// will be empty (default) if there are parse errors
if (result->parsed_grammar.rules.empty()) {
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
delete result;
return nullptr;
T & front() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[first];
}
// Ensure that there is a "root" node.
if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
delete result;
return nullptr;
const T & front() const {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[first];
}
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
struct llama_grammar * grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar");
T & back() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
result->grammar = grammar;
return data[pos];
}
result->prev.resize(params.n_prev);
const T & back() const {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[pos];
}
result->n_valid = 0;
void push_back(const T & value) {
if (sz == capacity) {
// advance the start when buffer is full
first = (first + 1) % capacity;
} else {
sz++;
}
data[pos] = value;
pos = (pos + 1) % capacity;
}
llama_sampling_set_rng_seed(result, params.seed);
T pop_front() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
T value = data[first];
first = (first + 1) % capacity;
sz--;
return value;
}
const T & rat(size_t i) const {
if (i >= sz) {
throw std::runtime_error("ring buffer: index out of bounds");
}
return data[(first + sz - i - 1) % capacity];
}
std::vector<T> to_vector() const {
std::vector<T> result;
result.reserve(sz);
for (size_t i = 0; i < sz; i++) {
result.push_back(data[(first + i) % capacity]);
}
return result;
}
void llama_sampling_free(struct llama_sampling_context * ctx) {
if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar);
}
delete ctx;
}
void llama_sampling_reset(llama_sampling_context * ctx) {
if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar);
ctx->grammar = NULL;
void clear() {
// here only reset the status of the buffer
sz = 0;
first = 0;
pos = 0;
}
if (!ctx->parsed_grammar.rules.empty()) {
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
struct llama_grammar * grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar");
}
ctx->grammar = grammar;
bool empty() const {
return sz == 0;
}
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
ctx->cur.clear();
ctx->n_valid = 0;
}
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
if (seed == LLAMA_DEFAULT_SEED) {
seed = std::random_device{}();
}
ctx->rng.seed(seed);
}
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
if (dst->grammar) {
llama_grammar_free(dst->grammar);
dst->grammar = nullptr;
size_t size() const {
return sz;
}
if (src->grammar) {
dst->grammar = llama_grammar_copy(src->grammar);
}
size_t capacity = 0;
size_t sz = 0;
size_t first = 0;
size_t pos = 0;
std::vector<T> data;
};
dst->prev = src->prev;
}
struct gpt_sampler {
gpt_sampler_params params;
llama_token llama_sampling_last(llama_sampling_context * ctx) {
return ctx->prev.back();
}
struct llama_sampler * grmr;
struct llama_sampler * chain;
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
const int size = ctx_sampling->prev.size();
ring_buffer<llama_token> prev;
n = std::min(n, size);
std::vector<llama_token_data> cur;
std::string result;
llama_token_data_array cur_p;
for (int i = size - n; i < size; i++) {
result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
}
void set_logits(struct llama_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx);
return result;
}
std::string llama_sampling_print(const llama_sampling_params & params) {
char result[1024];
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
params.mirostat, params.mirostat_eta, params.mirostat_tau);
return std::string(result);
}
std::string llama_sampling_order_print(const llama_sampling_params & params) {
std::string result = "CFG -> Penalties ";
if (params.mirostat == 0) {
for (auto sampler_type : params.samplers_sequence) {
const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
if (!sampler_type_name.empty()) {
result += "-> " + sampler_type_name + " ";
}
}
} else {
result += "-> mirostat ";
}
return result;
}
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
switch (sampler_type) {
case llama_sampler_type::TOP_K: return "top_k";
case llama_sampler_type::TFS_Z: return "tfs_z";
case llama_sampler_type::TYPICAL_P: return "typical_p";
case llama_sampler_type::TOP_P: return "top_p";
case llama_sampler_type::MIN_P: return "min_p";
case llama_sampler_type::TEMPERATURE: return "temperature";
default : return "";
}
}
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
{"top_k", llama_sampler_type::TOP_K},
{"top_p", llama_sampler_type::TOP_P},
{"typical_p", llama_sampler_type::TYPICAL_P},
{"min_p", llama_sampler_type::MIN_P},
{"tfs_z", llama_sampler_type::TFS_Z},
{"temperature", llama_sampler_type::TEMPERATURE}
};
// since samplers names are written multiple ways
// make it ready for both system names and input names
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
{"top-k", llama_sampler_type::TOP_K},
{"top-p", llama_sampler_type::TOP_P},
{"nucleus", llama_sampler_type::TOP_P},
{"typical-p", llama_sampler_type::TYPICAL_P},
{"typical", llama_sampler_type::TYPICAL_P},
{"min-p", llama_sampler_type::MIN_P},
{"tfs-z", llama_sampler_type::TFS_Z},
{"tfs", llama_sampler_type::TFS_Z},
{"temp", llama_sampler_type::TEMPERATURE}
};
std::vector<llama_sampler_type> sampler_types;
sampler_types.reserve(names.size());
for (const auto & name : names)
{
auto sampler_item = sampler_canonical_name_map.find(name);
if (sampler_item != sampler_canonical_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
else
{
if (allow_alt_names)
{
sampler_item = sampler_alt_name_map.find(name);
if (sampler_item != sampler_alt_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
}
}
}
return sampler_types;
}
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
std::unordered_map<char, llama_sampler_type> sampler_name_map {
{'k', llama_sampler_type::TOP_K},
{'p', llama_sampler_type::TOP_P},
{'y', llama_sampler_type::TYPICAL_P},
{'m', llama_sampler_type::MIN_P},
{'f', llama_sampler_type::TFS_Z},
{'t', llama_sampler_type::TEMPERATURE}
};
std::vector<llama_sampler_type> sampler_types;
sampler_types.reserve(names_string.size());
for (const auto & c : names_string) {
const auto sampler_item = sampler_name_map.find(c);
if (sampler_item != sampler_name_map.end()) {
sampler_types.push_back(sampler_item->second);
}
}
return sampler_types;
}
// no reasons to expose this function in header
static void sampler_queue(
struct llama_context * ctx_main,
const llama_sampling_params & params,
llama_token_data_array & cur_p,
size_t min_keep) {
const float temp = params.temp;
const float dynatemp_range = params.dynatemp_range;
const float dynatemp_exponent = params.dynatemp_exponent;
const int32_t top_k = params.top_k;
const float top_p = params.top_p;
const float min_p = params.min_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
for (auto sampler_type : samplers_sequence) {
switch (sampler_type) {
case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
case llama_sampler_type::TEMPERATURE:
if (dynatemp_range > 0) {
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
} else {
llama_sample_temp(ctx_main, &cur_p, temp);
}
break;
default : break;
}
}
}
static llama_token llama_sampling_sample_impl(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx,
bool is_resampling) {
const llama_sampling_params & params = ctx_sampling->params;
const float temp = params.temp;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
std::vector<float> original_logits;
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
if (ctx_sampling->grammar != NULL && !is_resampling) {
GGML_ASSERT(!original_logits.empty());
}
llama_token id = 0;
if (temp < 0.0) {
// greedy sampling, with probs
llama_sample_softmax(ctx_main, &cur_p);
id = cur_p.data[0].id;
} else if (temp == 0.0) {
// greedy sampling, no probs
id = llama_sample_token_greedy(ctx_main, &cur_p);
} else {
if (mirostat == 1) {
const int mirostat_m = 100;
llama_sample_temp(ctx_main, &cur_p, temp);
id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
} else if (mirostat == 2) {
llama_sample_temp(ctx_main, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
} else {
// temperature sampling
size_t min_keep = std::max(1, params.min_keep);
sampler_queue(ctx_main, params, cur_p, min_keep);
id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
//{
// const int n_top = 10;
// LOG("top %d candidates:\n", n_top);
// for (int i = 0; i < n_top; i++) {
// const llama_token id = cur_p.data[i].id;
// (void)id; // To avoid a warning that id is unused when logging is disabled.
// LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
// }
//}
//LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
}
}
if (ctx_sampling->grammar != NULL && !is_resampling) {
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
// Create an array with a single token data element for the sampled id
llama_token_data single_token_data = {id, logits[id], 0.0f};
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
// Apply grammar constraints to the single token
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
// If the token is not valid according to the grammar, perform resampling
if (!is_valid) {
LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
// Restore logits from the copy
std::copy(original_logits.begin(), original_logits.end(), logits);
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
}
}
ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
return id;
}
static llama_token_data_array llama_sampling_prepare_impl(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx,
bool apply_grammar,
std::vector<float> * original_logits) {
const llama_sampling_params & params = ctx_sampling->params;
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;
const bool penalize_nl = params.penalize_nl;
auto & prev = ctx_sampling->prev;
auto & cur = ctx_sampling->cur;
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
if (ctx_sampling->grammar != NULL && !apply_grammar) {
GGML_ASSERT(original_logits != NULL);
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
*original_logits = {logits, logits + n_vocab};
}
// apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
if (ctx_cfg) {
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
cur.resize(n_vocab);
@ -397,64 +121,338 @@ static llama_token_data_array llama_sampling_prepare_impl(
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
cur_p = { cur.data(), cur.size(), -1, false };
}
};
// apply penalties
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
if (penalty_tokens_used_size) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
std::string gpt_sampler_params::print() const {
char result[1024];
llama_sample_repetition_penalties(ctx_main, &cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
top_k, tfs_z, top_p, min_p, typ_p, temp,
mirostat, mirostat_eta, mirostat_tau);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
cur_p.data[idx].logit = nl_logit;
return std::string(result);
}
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = params.no_perf;
auto * result = new gpt_sampler {
/* .params = */ params,
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
/* .chain = */ llama_sampler_chain_init(lparams),
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
/* .cur_p = */ {},
};
llama_sampler_chain_add(result->chain,
llama_sampler_init_logit_bias(
llama_n_vocab(model),
params.logit_bias.size(),
params.logit_bias.data()));
llama_sampler_chain_add(result->chain,
llama_sampler_init_penalties(
llama_n_vocab (model),
llama_token_eos(model),
llama_token_nl (model),
params.penalty_last_n,
params.penalty_repeat,
params.penalty_freq,
params.penalty_present,
params.penalize_nl,
params.ignore_eos));
if (params.temp > 0.0f) {
if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) {
switch (cnstr) {
case GPT_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break;
case GPT_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
break;
case GPT_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break;
case GPT_SAMPLER_TYPE_TFS_Z:
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
break;
case GPT_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
case GPT_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
default:
GGML_ASSERT(false && "unknown sampler type");
}
}
llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
} else {
GGML_ASSERT(false && "unknown mirostat version");
}
} else {
if (params.n_probs > 0) {
// some use cases require to sample greedily, but still obtain the probabilities of the top tokens
// ref: https://github.com/ggerganov/llama.cpp/pull/9605
//
// the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
// it is much faster, since we avoid sorting all tokens and should give a good approximation
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
}
llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
}
// apply grammar checks before sampling logic
if (apply_grammar && ctx_sampling->grammar != NULL) {
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
}
return cur_p;
return result;
}
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
// Call the implementation function with is_resampling set to false by default
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
}
void gpt_sampler_free(struct gpt_sampler * gsmpl) {
if (gsmpl) {
llama_sampler_free(gsmpl->grmr);
llama_token_data_array llama_sampling_prepare(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx,
bool apply_grammar,
std::vector<float> * original_logits) {
return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
}
llama_sampler_free(gsmpl->chain);
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
llama_token id,
bool apply_grammar) {
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
ctx_sampling->prev.push_back(id);
if (ctx_sampling->grammar != NULL && apply_grammar) {
llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
delete gsmpl;
}
}
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
if (accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
}
llama_sampler_accept(gsmpl->chain, token);
gsmpl->prev.push_back(token);
}
void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
llama_sampler_reset(gsmpl->grmr);
llama_sampler_reset(gsmpl->chain);
}
struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
return new gpt_sampler {
/* .params = */ gsmpl->params,
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
/* .chain = */ llama_sampler_clone(gsmpl->chain),
/* .prev = */ gsmpl->prev,
/* .cur = */ gsmpl->cur,
/* .cur_p = */ gsmpl->cur_p,
};
}
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
// TODO: measure grammar performance
if (gsmpl) {
llama_perf_sampler_print(gsmpl->chain);
}
if (ctx) {
llama_perf_context_print(ctx);
}
}
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
gsmpl->set_logits(ctx, idx);
auto & grmr = gsmpl->grmr;
auto & chain = gsmpl->chain;
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
if (grammar_first) {
llama_sampler_apply(grmr, &cur_p);
}
llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
const llama_token id = cur_p.data[cur_p.selected].id;
if (grammar_first) {
return id;
}
// check if it the sampled token fits the grammar
{
llama_token_data single_token_data = { id, 1.0f, 0.0f };
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
llama_sampler_apply(grmr, &single_token_data_array);
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
if (is_valid) {
return id;
}
}
// resampling:
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
gsmpl->set_logits(ctx, idx);
llama_sampler_apply(grmr, &cur_p);
llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
return cur_p.data[cur_p.selected].id;
}
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
return llama_sampler_get_seed(gsmpl->chain);
}
// helpers
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
return &gsmpl->cur_p;
}
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
return gsmpl->prev.rat(0);
}
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
std::string result = "logits ";
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
result += std::string("-> ") + llama_sampler_name(smpl) + " ";
}
return result;
}
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
n = std::min(n, (int) gsmpl->prev.size());
if (n <= 0) {
return "";
}
std::string result;
result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
for (int i = n - 1; i >= 0; i--) {
const llama_token id = gsmpl->prev.rat(i);
GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
result += llama_token_to_piece(ctx_main, id);
}
return result;
}
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
switch (cnstr) {
case GPT_SAMPLER_TYPE_TOP_K: return 'k';
case GPT_SAMPLER_TYPE_TFS_Z: return 'f';
case GPT_SAMPLER_TYPE_TYPICAL_P: return 'y';
case GPT_SAMPLER_TYPE_TOP_P: return 'p';
case GPT_SAMPLER_TYPE_MIN_P: return 'm';
case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
default : return '?';
}
}
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
switch (cnstr) {
case GPT_SAMPLER_TYPE_TOP_K: return "top_k";
case GPT_SAMPLER_TYPE_TFS_Z: return "tfs_z";
case GPT_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
case GPT_SAMPLER_TYPE_TOP_P: return "top_p";
case GPT_SAMPLER_TYPE_MIN_P: return "min_p";
case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
default : return "";
}
}
std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
{ "top_k", GPT_SAMPLER_TYPE_TOP_K },
{ "top_p", GPT_SAMPLER_TYPE_TOP_P },
{ "typ_p", GPT_SAMPLER_TYPE_TYPICAL_P },
{ "min_p", GPT_SAMPLER_TYPE_MIN_P },
{ "tfs_z", GPT_SAMPLER_TYPE_TFS_Z },
{ "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
};
// since samplers names are written multiple ways
// make it ready for both system names and input names
std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
{ "top-k", GPT_SAMPLER_TYPE_TOP_K },
{ "top-p", GPT_SAMPLER_TYPE_TOP_P },
{ "nucleus", GPT_SAMPLER_TYPE_TOP_P },
{ "typical-p", GPT_SAMPLER_TYPE_TYPICAL_P },
{ "typical", GPT_SAMPLER_TYPE_TYPICAL_P },
{ "typ-p", GPT_SAMPLER_TYPE_TYPICAL_P },
{ "typ", GPT_SAMPLER_TYPE_TYPICAL_P },
{ "min-p", GPT_SAMPLER_TYPE_MIN_P },
{ "tfs-z", GPT_SAMPLER_TYPE_TFS_Z },
{ "tfs", GPT_SAMPLER_TYPE_TFS_Z },
{ "temp", GPT_SAMPLER_TYPE_TEMPERATURE },
};
std::vector<gpt_sampler_type> samplers;
samplers.reserve(names.size());
for (const auto & name : names) {
auto sampler = sampler_canonical_name_map.find(name);
if (sampler != sampler_canonical_name_map.end()) {
samplers.push_back(sampler->second);
} else {
if (allow_alt_names) {
sampler = sampler_alt_name_map.find(name);
if (sampler != sampler_alt_name_map.end()) {
samplers.push_back(sampler->second);
}
}
}
}
return samplers;
}
std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
};
std::vector<gpt_sampler_type> samplers;
samplers.reserve(chars.size());
for (const auto & c : chars) {
const auto sampler = sampler_name_map.find(c);
if (sampler != sampler_name_map.end()) {
samplers.push_back(sampler->second);
}
}
return samplers;
}

View file

@ -2,159 +2,82 @@
#include "llama.h"
#include "grammar-parser.h"
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
// sampler types
enum class llama_sampler_type : char {
TOP_K = 'k',
TOP_P = 'p',
MIN_P = 'm',
TFS_Z = 'f',
TYPICAL_P = 'y',
TEMPERATURE = 't'
};
// sampling parameters
typedef struct llama_sampling_params {
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
std::vector<llama_sampler_type> samplers_sequence = {
llama_sampler_type::TOP_K,
llama_sampler_type::TFS_Z,
llama_sampler_type::TYPICAL_P,
llama_sampler_type::TOP_P,
llama_sampler_type::MIN_P,
llama_sampler_type::TEMPERATURE
};
std::string grammar; // optional BNF-like grammar to constrain sampling
// Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // how strong is guidance
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
std::vector<llama_token> penalty_prompt_tokens;
bool use_penalty_prompt_tokens = false;
} llama_sampling_params;
// general sampler context
// TODO: move to llama.h
struct llama_sampling_context {
// parameters that will be used for sampling
llama_sampling_params params;
// mirostat sampler state
float mirostat_mu;
llama_grammar * grammar;
// internal
grammar_parser::parse_state parsed_grammar;
// TODO: replace with ring-buffer
std::vector<llama_token> prev;
std::vector<llama_token_data> cur;
size_t n_valid; // Number of correct top tokens with correct probabilities.
std::mt19937 rng;
};
#include "common.h"
// Create a new sampling context instance.
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
#include <string>
#include <vector>
void llama_sampling_free(struct llama_sampling_context * ctx);
// Reset the sampler context
// - clear prev tokens
// - reset grammar
void llama_sampling_reset(llama_sampling_context * ctx);
// Set the sampler seed
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
// Copy the sampler context
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
// Get the last sampled token
llama_token llama_sampling_last(llama_sampling_context * ctx);
// Get a string representation of the last sampled tokens
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
// Print sampling parameters into a string
std::string llama_sampling_print(const llama_sampling_params & params);
// Print sampling order into a string
std::string llama_sampling_order_print(const llama_sampling_params & params);
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call
// llama_sampling_reset when a sequence ends
// gpt_sampler extends llama_sampler with additional functionality:
//
// required:
// - ctx_main: context to use for sampling
// - ctx_sampling: sampling-specific context
// - grammar support
// - custom sampler logic based on the parameters
// - history of the last accepted tokens
// - performance metrics
//
// optional:
// - ctx_cfg: context to use for classifier-free guidance
// - idx: sample from llama_get_logits_ith(ctx, idx)
// This goal is to have a common implementation of the sampling logic shared across the examples.
// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
// complex (top-k, top-p, etc).
//
// returns:
// - token: sampled token
// - candidates: vector of candidate tokens
// Another example is related to the grammar. In general, the grammar constraints applied on the full
// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
// grammar constraints are applied to the full vocabulary and the token is resampled.
//
// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
// be moved into the core llama library.
//
// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens.
//
// TODO: measure grammar performance
//
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = -1);
// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
llama_token_data_array llama_sampling_prepare(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = 0,
bool apply_grammar = true,
std::vector<float> * original_logits = nullptr);
struct gpt_sampler;
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
llama_token id,
bool apply_grammar);
// llama_sampler API overloads
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
void gpt_sampler_free(struct gpt_sampler * gsmpl);
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
void gpt_sampler_reset (struct gpt_sampler * gsmpl);
struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
// arguments can be nullptr to skip printing
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
// extended sampling implementation:
//
// - set logits
// - apply the configured sampler chain
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
// helpers
// access the internal list of current candidate tokens
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
// get the last accepted token
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
// print the sampler chain into a string
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
// get a string representation of the last accepted tokens
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);

File diff suppressed because it is too large Load diff

View file

@ -1,9 +1,11 @@
#include "train.h"
#include "common.h"
#include <algorithm>
#include <random>
#include <sstream>
#include <functional>
#include <cstring>
struct random_normal_distribution {
std::mt19937 gen;

File diff suppressed because it is too large Load diff

View file

@ -31,6 +31,7 @@ import re
import requests
import sys
import json
import shutil
from hashlib import sha256
from enum import IntEnum, auto
@ -80,6 +81,7 @@ models = [
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
@ -94,6 +96,11 @@ models = [
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
]
@ -122,6 +129,21 @@ def download_model(model):
if tokt == TOKENIZER_TYPE.UGM:
files.append("spiece.model")
if os.path.isdir(repo):
# If repo is a path on the file system, copy the directory
for file in files:
src_path = os.path.join(repo, file)
dst_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(dst_path):
logger.info(f"{name}: File {dst_path} already exists - skipping")
continue
if os.path.isfile(src_path):
shutil.copy2(src_path, dst_path)
logger.info(f"{name}: Copied {src_path} to {dst_path}")
else:
logger.warning(f"{name}: Source file {src_path} does not exist")
else:
# If repo is a URL, download the files
for file in files:
save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path):

View file

@ -116,7 +116,7 @@ class Tensor:
assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant
offset += 12
self.dtype= dtype
self.dtype= gguf.GGMLQuantizationType(dtype)
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len])

View file

@ -331,6 +331,10 @@ if __name__ == '__main__':
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
super().set_gguf_parameters()
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
return ()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
tensor_map: dict[str, PartialLoraTensor] = {}
@ -363,7 +367,13 @@ if __name__ == '__main__':
yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
dest = super().modify_tensors(data_torch, name, bid)
dest = list(super().modify_tensors(data_torch, name, bid))
# some archs may have the same tensor for lm_head and output (tie word embeddings)
# in this case, adapters targeting lm_head will fail when using llama-export-lora
# therefore, we ignore them for now
# see: https://github.com/ggerganov/llama.cpp/issues/9065
if name == "lm_head.weight" and len(dest) == 0:
raise ValueError("lm_head is present in adapter, but is ignored in base model")
for dest_name, dest_data in dest:
assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B()

View file

@ -2,55 +2,82 @@
# Android
## Build on Android using Termux
[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
[Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid.
[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
## Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```
$ mkdir build-android
$ cd build-android
$ export NDK=<your_ndk_directory>
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
$ make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
With Termux, you can install and run `llama.cpp` as if the environment were Linux. Once in the Termux shell:
```
$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
$ apt update && apt upgrade -y
$ apt install git cmake
```
Now, you can start chatting:
Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake.
Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:
```
$cd /data/data/com.termux/files/home/bin
$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
$ curl -L {model-url} -o ~/{model}.gguf
```
Here's a demo of an interactive session running on Pixel 5 phone:
Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
```
$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
```
Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
## Cross-compile using Android NDK
It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.)
Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory:
```
$ cmake \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DCMAKE_C_FLAGS="-march=armv8.7a" \
-DCMAKE_CXX_FLAGS="-march=armv8.7a" \
-DGGML_OPENMP=OFF \
-DGGML_LLAMAFILE=OFF \
-B build-android
```
Notes:
- While later versions of Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time
- `llamafile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/llamafile/issues/325)
The above command should configure `llama.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `llama.cpp` includes runtime checks for available CPU features it can use.
Feel free to adjust the Android ABI for your target. Once the project is configured:
```
$ cmake --build build-android --config Release -j{n}
$ cmake --install build-android --prefix {install-dir} --config Release
```
After installing, go ahead and download the model of your choice to your host system. Then:
```
$ adb shell "mkdir /data/local/tmp/llama.cpp"
$ adb push {install-dir} /data/local/tmp/llama.cpp/
$ adb push {model}.gguf /data/local/tmp/llama.cpp/
$ adb shell
```
In the `adb shell`:
```
$ cd /data/local/tmp/llama.cpp
$ LD_LIBRARY_PATH=lib ./bin/llama-simple -m {model}.gguf -c {context-size} -p "{your-prompt}"
```
That's it!
Be aware that Android will not find the library path `lib` on its own, so we must specify `LD_LIBRARY_PATH` in order to run the installed executables. Android does support `RPATH` in later API levels, so this could change in the future. Refer to the previous section for information about `context-size` (very important!) and running other `examples`.

259
docs/backend/CANN.md Normal file
View file

@ -0,0 +1,259 @@
# llama.cpp for CANN
- [Background](#background)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)
- [Model Supports](#model-supports)
- [DataType Supports](#datatype-supports)
- [Docker](#docker)
- [Linux](#linux)
- [TODO](#todo)
## Background
**Ascend NPU** is a range of AI processors using Neural Processing Unit. It will efficiently handle matrix-matrix multiplication, dot-product and scalars.
**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
**Llama.cpp + CANN**
The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly.
## News
- 2024.8
- Support `Q4_0` and `Q8_0` data type for Ascend NPU.
- 2024.7
- Create CANN backend for Ascend NPU.
## OS
| OS | Status | Verified |
|:-------:|:-------:|:----------------------------------------------:|
| Linux | Support | Ubuntu 22.04, OpenEuler22.03 |
## Hardware
### Ascend NPU
**Verified devices**
| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
*Notes:*
- If you have trouble with Ascend NPU device, please create a issue with **[CANN]** prefix/tag.
- If you run successfully with your Ascend NPU device, please help update the upper table.
## Model Supports
| Model Name | FP16 | Q8_0 | Q4_0 |
|:----------------------------|:-----:|:----:|:----:|
| AquilaChat2-7B | √ | √ | √ |
| Baichuan-7b | √ | √ | √ |
| Baichuan2-7B-Chat | √ | √ | √ |
| bitnet_b1_58-large | √ | √ | √ |
| bloom-560m | √ | x | √ |
| bloomz-alpaca-560m | √ | x | √ |
| c4ai-command-r-35B-v01 | x | x | x |
| chatglm3-6B | x | x | x |
| chinese-alpaca-2-1.3b | √ | √ | √ |
| CodeShell-7B | √ | √ | √ |
| deepseek-ai_deepseek-coder-1.3B-base | x | x | x |
| deepseek-ai_DeepSeek-V2-Lite | x | x | x |
| deepseek-coder-6.7B-instruct | x | x | x |
| DeepSeek-V2-Lite-64x1.5B | x | x | x |
| falcon-7b-instruct | √ | √ | √ |
| flan-t5-large | √ | √ | √ |
| gemma-2-9b-it | √ | √ | √ |
| glm-4-9B | x | x | x |
| gpt2 | √ | √ | √ |
| Gpt2-163M | √ | √ | √ |
| granite-3B-code-instruct | √ | √ | √ |
| GritLM-7B | √ | √ | √ |
| internlm2_5-7b-chat | √ | √ | √ |
| koala-7B-HF | √ | √ | √ |
| Llama-2-7b-chat-hf | √ | √ | √ |
| Llama-3-Smaug-8B | √ | √ | √ |
| Llama2-Chinese-7b-Chat | √ | √ | √ |
| Llama3-8B | √ | √ | √ |
| Llama3-8b-chinese | √ | √ | √ |
| mamba-130m-hf | √ | √ | √ |
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
| mpt-7B | √ | √ | √ |
| OLMo-1B-hf | √ | √ | √ |
| OpenELM-3B-Instruct | √ | √ | √ |
| Orion-14b-base | √ | √ | √ |
| phi1 | x | x | x |
| phi2 | x | x | x |
| Phi-3-mini-4k-instruct | √ | √ | √ |
| plamo-13b | √ | √ | √ |
| pythia-70M | x | x | x |
| Qwen-7B | √ | √ | √ |
| Qwen2-1.5B-Instruct | √ | x | √ |
| Refact-1_6B-fim | √ | √ | √ |
| SmolLM-135M | √ | √ | √ |
| stablelm-zephyr | x | x | x |
| stablelm-2-zephyr-1_6b | x | x | x |
| starcoderbase-1b | √ | √ | √ |
| starcoder2-3b | √ | √ | √ |
| vigogne-7b-chat | √ | √ | √ |
| xverse-7b-chat | √ | √ | √ |
| Yi-6b-Chat | √ | √ | √ |
## DataType Supports
| DataType | Status |
|:----------------------:|:-------:|
| FP16 | Support |
| Q8_0 | Support |
| Q4_0 | Support |
## Docker
### Build Images
You can get a image with llama.cpp in one command.
```sh
docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile .
```
### Run container
```sh
# Find all cards.
npu-smi info
# Select the cards that you want to use, make sure these cards are not used by someone.
# Following using cards of device0.
docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
```
*Notes:*
- You may need to install Ascend Driver and firmware on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
## Linux
### I. Setup Environment
1. **Install Ascend Driver and firmware**
```sh
# create driver running user.
sudo groupadd -g HwHiAiUser
sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
sudo usermod -aG HwHiAiUser $USER
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
# and install driver.
sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all
```
Once installed, run `npu-smi info` to check whether driver is installed successfully.
```sh
+-------------------------------------------------------------------------------------------+
| npu-smi 24.1.rc2 Version: 24.1.rc2 |
+----------------------+---------------+----------------------------------------------------+
| NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)|
| Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) |
+======================+===============+====================================================+
| 2 xxx | OK | 64.4 51 15 / 15 |
| 0 | 0000:01:00.0 | 0 1873 / 15077 0 / 32768 |
+======================+===============+====================================================+
| 5 xxx | OK | 64.0 52 15 / 15 |
| 0 | 0000:81:00.0 | 0 1874 / 15077 0 / 32768 |
+======================+===============+====================================================+
| No running processes found in NPU 2 |
+======================+===============+====================================================+
| No running processes found in NPU 5 |
+======================+===============+====================================================+
```
2. **Install Ascend Firmware**
```sh
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
# and install driver.
sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
```
If the following messaage appers, firmware is installed successfully.
```sh
Firmware package installed successfully!
```
3. **Install CANN toolkit and kernels**
CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page.
Please download the corresponding version that satified your system. The minimum version required is 8.0.RC2.alpha002 and here is the install command.
```sh
pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install
sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install
```
Set Ascend Variables:
```sh
echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
source ~/.bashrc
```
Upon a successful installation, CANN is enabled for the available ascend devices.
### II. Build llama.cpp
```sh
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
cmake --build build --config release
```
### III. Run the inference
1. **Retrieve and prepare model**
You can refer to the general [*Prepare and Quantize*](../../README.md#prepare-and-quantize) guide for model prepration.
**Notes**:
- CANN backend only supports FP16/Q4_0/Q8_0 models currently.
2. **Launch inference**
There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically choose the devices with the same backend.
| Device selection | Parameter |
|:----------------:|:--------------------------------------:|
| Single device | --split-mode none --main-gpu DEVICE_ID |
| Multiple devices | --split-mode layer (default) |
Examples:
- Use device 0:
```sh
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
- Use multiple devices:
```sh
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
### **GitHub contribution**:
Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay.
## TODO
- Support more models and data types.

View file

@ -20,17 +20,13 @@
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
### Llama.cpp + SYCL
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD.
## Recommended Release
@ -45,6 +41,10 @@ The following release is verified with good quality:
## News
- 2024.8
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
- 2024.5
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
- Arch Linux is verified successfully.
@ -80,7 +80,14 @@ The following release is verified with good quality:
### Intel GPU
**Verified devices**
SYCL backend supports Intel GPU Family:
- Intel Data Center Max Series
- Intel Flex Series, Arc Series
- Intel Built-in Arc GPU
- Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)).
#### Verified devices
| Intel GPU | Status | Verified Model |
|-------------------------------|---------|---------------------------------------|
@ -88,7 +95,7 @@ The following release is verified with good quality:
| Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
| Intel iGPU | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 |
*Notes:*
@ -105,9 +112,17 @@ The following release is verified with good quality:
**Verified devices**
| Nvidia GPU | Status | Verified Model |
|--------------------------|---------|----------------|
| Ampere Series | Support | A100, A4000 |
| Ampere Series *(Mobile)* | Support | RTX 40 Series |
|--------------------------|-----------|----------------|
| Ampere Series | Supported | A100, A4000 |
| Ampere Series *(Mobile)* | Supported | RTX 40 Series |
| AMD GPU | Status | Verified Model |
|--------------------------|--------------|----------------|
| Radeon Pro | Experimental | W6800 |
| Radeon RX | Experimental | 6700 XT |
Note: AMD GPU support is highly experimental and is incompatible with F16.
Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.
## Docker
The docker build option is currently limited to *intel GPU* targets.
@ -179,6 +194,10 @@ Platform #0: Intel(R) OpenCL HD Graphics
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
- **AMD GPU**
To target AMD GPUs with SYCL, the ROCm stack must be installed first.
2. **Install Intel® oneAPI Base toolkit**
- **For Intel GPU**
@ -189,7 +208,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li
Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.
- **Adding support to Nvidia GPUs**
@ -205,6 +224,19 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB
cmake --build buildWithCublas --config Release
```
- **Adding support to AMD GPUs**
**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
```sh
git clone https://github.com/oneapi-src/oneMKL
cd oneMKL
# Find your HIPTARGET with rocminfo, under the key 'Name:'
cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
cmake --build buildWithrocBLAS --config Release
```
3. **Verify installation and environment**
@ -216,33 +248,48 @@ sycl-ls
- **Intel GPU**
When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below:
When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below:
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu][opencl:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[level_zero:gpu][level_zero:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```
- **Nvidia GPU**
Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`cuda:gpu`] as below:
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
[opencl:cpu][opencl:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.5]
```
- **AMD GPU**
For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
```
[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i9-12900K OpenCL 3.0 (Build 0) [2024.18.6.0.02_160000]
[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
```
### II. Build llama.cpp
#### Intel GPU
```
./examples/sycl/build.sh
```
or
```sh
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh
# Build LLAMA with MKL BLAS acceleration for intel GPU
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
@ -254,6 +301,7 @@ cmake --build build --config Release -j -v
```
#### Nvidia GPU
```sh
# Export relevant ENV variables
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
@ -271,28 +319,49 @@ cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -
# build all binary
cmake --build build --config Release -j -v
```
#### AMD GPU
```sh
# Export relevant ENV variables
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with rocBLAS acceleration through SYCL
## AMD
# Use FP32, FP16 is not supported
# Find your GGML_SYCL_HIP_TARGET with rocminfo, under the key 'Name:'
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_HIP_TARGET=${GGML_SYCL_HIP_TARGET} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# build all binary
cmake --build build --config Release -j -v
```
### III. Run the inference
1. Retrieve and prepare model
#### Retrieve and prepare model
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
2. Enable oneAPI running environment
##### Check device
1. Enable oneAPI running environment
```sh
source /opt/intel/oneapi/setvars.sh
```
3. List devices information
2. List devices information
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
```sh
./build/bin/llama-ls-sycl-device
```
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
```
found 2 SYCL devices:
@ -304,12 +373,37 @@ found 2 SYCL devices:
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
```
#### Choose level-zero devices
4. Launch inference
|Chosen Device ID|Setting|
|-|-|
|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
|1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
#### Execute
Choose one of following methods to run.
1. Script
- Use device 0:
```sh
./examples/sycl/run-llama2.sh 0
```
- Use multiple devices:
```sh
./examples/sycl/run-llama2.sh
```
2. Command line
Launch inference
There are two device selection modes:
- Single device: Use one device target specified by the user.
- Single device: Use one device assigned by user. Default device id is 0.
- Multiple devices: Automatically choose the devices with the same backend.
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
@ -326,11 +420,6 @@ Examples:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
or run by script:
```sh
./examples/sycl/run_llama2.sh 0
```
- Use multiple devices:
@ -338,12 +427,6 @@ or run by script:
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
Otherwise, you can run the script:
```sh
./examples/sycl/run_llama2.sh
```
*Notes:*
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
@ -390,7 +473,7 @@ c. Verify installation
In the oneAPI command line, run the following to print the available SYCL devices:
```
sycl-ls
sycl-ls.exe
```
There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device:
@ -411,6 +494,18 @@ b. The new Visual Studio will install Ninja as default. (If not, please install
### II. Build llama.cpp
You could download the release package for Windows directly, which including binary files and depended oneAPI dll files.
Choose one of following methods to build from source code.
1. Script
```sh
.\examples\sycl\win-build-sycl.bat
```
2. CMake
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
```
@ -425,12 +520,8 @@ cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPI
cmake --build build --config Release -j
```
Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
```sh
.\examples\sycl\win-build-sycl.bat
```
Or, use CMake presets to build:
```sh
cmake --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-cli
@ -442,7 +533,9 @@ cmake --preset x64-windows-sycl-debug
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
```
Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
3. Visual Studio
You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
*Notes:*
@ -450,23 +543,25 @@ Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choos
### III. Run the inference
1. Retrieve and prepare model
#### Retrieve and prepare model
You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
2. Enable oneAPI running environment
##### Check device
1. Enable oneAPI running environment
On the oneAPI command line window, run the following and step into the llama.cpp directory:
```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```
3. List devices information
2. List devices information
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
```
build\bin\ls-sycl-device.exe
build\bin\llama-ls-sycl-device.exe
```
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
@ -479,9 +574,27 @@ found 2 SYCL devices:
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
```
#### Choose level-zero devices
|Chosen Device ID|Setting|
|-|-|
|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
4. Launch inference
#### Execute
Choose one of following methods to run.
1. Script
```
examples\sycl\win-run-llama2.bat
```
2. Command line
Launch inference
There are two device selection modes:
@ -508,11 +621,7 @@ build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website ca
```
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
Otherwise, run the following wrapper script:
```
.\examples\sycl\win-run-llama2.bat
```
Note:
@ -526,17 +635,18 @@ Or
use 1 SYCL GPUs: [0] with Max compute units:512
```
## Environment Variable
#### Build
| Name | Value | Function |
|--------------------|-----------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. |
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
|--------------------|---------------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
#### Runtime
@ -572,9 +682,26 @@ use 1 SYCL GPUs: [0] with Max compute units:512
```
Otherwise, please double-check the GPU driver installation steps.
- Can I report Ollama issue on Intel GPU to llama.cpp SYCL backend?
No. We can't support Ollama issue directly, because we aren't familiar with Ollama.
Sugguest reproducing on llama.cpp and report similar issue to llama.cpp. We will surpport it.
It's same for other projects including llama.cpp SYCL backend.
- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
Device Memory is not enough.
|Reason|Solution|
|-|-|
|Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.|
|Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;<br>Use more than one devices to load model.|
### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
## TODO
- Support row layer split for multiple card runs.
- NA

View file

@ -352,6 +352,37 @@ cmake --build build --config Release
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
```
### CANN
This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU.
For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/).
Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
Go to `llama.cpp` directory and build using CMake.
```bash
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
cmake --build build --config release
```
You can test with:
`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
```bash
llm_load_tensors: CANN buffer size = 13313.00 MiB
llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
```
For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
### Android
To read documentation for how to build on Android, [click here](./android.md)
### Arm CPU optimized mulmat kernels
Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).

View file

@ -20,7 +20,7 @@ Additionally, there the following images, similar to the above:
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
## Usage
@ -66,8 +66,8 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment
The defaults are:
- `CUDA_VERSION` set to `11.7.1`
- `CUDA_DOCKER_ARCH` set to `all`
- `CUDA_VERSION` set to `12.6.0`
- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures
The resulting images, are essentially the same as the non-CUDA images:

View file

@ -16,7 +16,6 @@ else()
add_subdirectory(baby-llama)
add_subdirectory(batched-bench)
add_subdirectory(batched)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(eval-callback)

View file

@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
#endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);

View file

@ -49,3 +49,12 @@ There are 2 modes of operation:
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
### JSONL output
Pass `--output-format jsonl` to output JSONL instead of Markdown, á la
```json lines
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
```

View file

@ -1,49 +1,28 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
// mutates the input string
static std::vector<int> parse_list(char * p) {
std::vector<int> ret;
char * q = p;
while (*p) {
if (*p == ',') {
*p = '\0';
ret.push_back(std::atoi(q));
q = p + 1;
}
++p;
}
ret.push_back(std::atoi(q));
return ret;
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG_TEE("\n");
static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n");
LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG("\n");
}
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
return 1;
}
gpt_init();
int is_pp_shared = params.is_pp_shared;
std::vector<int> n_pp = params.n_pp;
@ -100,7 +79,7 @@ int main(int argc, char ** argv) {
const int ret = llama_decode(ctx, batch_view);
if (ret != 0) {
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}
@ -117,17 +96,18 @@ int main(int argc, char ** argv) {
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
}
LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
if (!params.batched_bench_output_jsonl) {
LOG("\n");
LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG("\n");
LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
}
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@ -156,7 +136,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
@ -178,7 +158,7 @@ int main(int argc, char ** argv) {
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
}
@ -195,12 +175,22 @@ int main(int argc, char ** argv) {
const float speed_tg = pl*tg / t_tg;
const float speed = n_kv / t;
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
if(params.batched_bench_output_jsonl) {
LOG(
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
"\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
);
} else {
LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
}
}
}
}
llama_print_timings(ctx);
LOG("\n");
llama_perf_context_print(ctx);
llama_batch_free(batch);
@ -209,7 +199,7 @@ int main(int argc, char ** argv) {
llama_backend_free();
fprintf(stderr, "\n\n");
LOG("\n\n");
return 0;
}

View file

@ -27,7 +27,6 @@ guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), mo
print("Failed to load model")
exit(1)
}
defer {
llama_free_model(model)
}
@ -37,7 +36,6 @@ var tokens = tokenize(text: prompt, add_bos: true)
let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
var context_params = llama_context_default_params()
context_params.seed = 1234
context_params.n_ctx = n_kv_req
context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8
@ -48,11 +46,26 @@ guard context != nil else {
print("Failed to initialize context")
exit(1)
}
defer {
llama_free(context)
}
var sparams = llama_sampler_chain_default_params()
let smpl = llama_sampler_chain_init(sparams)
guard smpl != nil else {
print("Failed to initialize sampling")
exit(1)
}
defer {
llama_sampler_free(smpl)
}
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4));
llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234));
let n_ctx = llama_n_ctx(context)
print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
@ -125,32 +138,7 @@ while n_cur <= n_len {
continue
}
var n_vocab = llama_n_vocab(model)
var logits = llama_get_logits_ith(context, i_batch[i])
var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
for token_id in 0 ..< n_vocab {
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
}
var candidates_p: llama_token_data_array = .init(
data: &candidates,
size: candidates.count,
sorted: false
)
let top_k: Int32 = 40
let top_p: Float = 0.9
let temp: Float = 0.4
llama_sample_top_k(context, &candidates_p, top_k, 1)
llama_sample_top_p(context, &candidates_p, top_p, 1)
llama_sample_temp(context, &candidates_p, temp)
let new_token_id = llama_sample_token(context, &candidates_p)
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
// is it an end of stream? -> mark the stream as finished
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
@ -210,9 +198,10 @@ if n_parallel > 1 {
let t_main_end = ggml_time_us()
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
llama_print_timings(context)
llama_perf_sampler_print(smpl)
llama_perf_context_print(context)
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count

View file

@ -1,18 +1,17 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
LOG_TEE("\n");
static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n");
LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
LOG("\n");
}
int main(int argc, char ** argv) {
@ -21,11 +20,11 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
params.n_predict = 32;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
return 1;
}
gpt_init();
// number of parallel batches
int n_parallel = params.n_parallel;
@ -45,7 +44,7 @@ int main(int argc, char ** argv) {
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
LOG_ERR("%s: error: unable to load model\n" , __func__);
return 1;
}
@ -65,32 +64,39 @@ int main(int argc, char ** argv) {
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
auto sparams = llama_sampler_chain_default_params();
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
return 1;
}
const int n_ctx = llama_n_ctx(ctx);
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
return 1;
}
// print the prompt token-by-token
fprintf(stderr, "\n");
LOG("\n");
for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
// create a llama_batch
// we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
@ -108,7 +114,7 @@ int main(int argc, char ** argv) {
if (llama_model_has_encoder(model)) {
if (llama_encode(ctx, batch)) {
LOG_TEE("%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
@ -125,7 +131,7 @@ int main(int argc, char ** argv) {
batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
@ -136,7 +142,7 @@ int main(int argc, char ** argv) {
//}
if (n_parallel > 1) {
LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
}
// main loop
@ -164,36 +170,14 @@ int main(int argc, char ** argv) {
continue;
}
auto n_vocab = llama_n_vocab(model);
auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
const int top_k = 40;
const float top_p = 0.9f;
const float temp = 0.4f;
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_temp (ctx, &candidates_p, temp);
const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
// is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1;
LOG_TEE("\n");
LOG("\n");
if (n_parallel > 1) {
LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
}
continue;
@ -201,8 +185,7 @@ int main(int argc, char ** argv) {
// if there is only one stream, we print immediately to stdout
if (n_parallel == 1) {
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
}
streams[i] += llama_token_to_piece(ctx, new_token_id);
@ -224,32 +207,33 @@ int main(int argc, char ** argv) {
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
LOG_TEE("\n");
if (n_parallel > 1) {
LOG_TEE("\n");
LOG("\n");
for (int32_t i = 0; i < n_parallel; ++i) {
LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
}
}
const auto t_main_end = ggml_time_us();
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
llama_print_timings(ctx);
LOG("\n");
llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx);
fprintf(stderr, "\n");
llama_batch_free(batch);
llama_sampler_free(smpl);
llama_free(ctx);
llama_free_model(model);

View file

@ -1,6 +0,0 @@
set(TARGET llama-bench-matmult)
add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,275 +0,0 @@
#include "common.h"
#include "ggml.h"
#include <locale.h>
#include <assert.h>
#include <math.h>
#include <cstring>
#include <cstdio>
#include <cinttypes>
#include <unordered_map>
#include <queue>
#include <string.h>
#include <cassert>
#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}
ggml_graph_compute(graph, &plan);
}
static float tensor_sum_elements(const ggml_tensor * tensor) {
double sum = 0;
if (tensor->type == GGML_TYPE_F32) {
for (int j = 0; j < tensor->ne[1]; j++) {
for (int k = 0; k < tensor->ne[0]; k++) {
sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
}
}
}
return sum;
}
static void tensor_dump(const ggml_tensor * tensor, const char * name) {
printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
tensor->type, ggml_type_name(tensor->type),
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
float sum = tensor_sum_elements(tensor);
printf("Sum of tensor %s is %6.2f\n", name, sum);
}
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
struct benchmark_params_struct {
int32_t n_threads = 1;
int32_t n_iterations = 10;
};
static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations);
fprintf(stderr, "\n");
}
int main(int argc, char ** argv) {
struct benchmark_params_struct benchmark_params;
bool invalid_param = false;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
benchmark_params.n_threads = std::stoi(argv[i]);
} else if (arg == "-i" || arg == "--iter") {
if (++i >= argc) {
invalid_param = true;
break;
}
benchmark_params.n_iterations = std::stoi(argv[i]);
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv, benchmark_params);
exit(0);
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv, benchmark_params);
exit(1);
}
print_build_info();
printf("Starting Test\n");
// create the ggml context
struct ggml_context * ctx;
//const int sizex = 4096;
//const int sizey = 11008;
#undef VERBOSE_DEBUGGING
#ifndef VERBOSE_DEBUGGING
const int sizey = 4096;
const int sizex = 11008;
const int sizez = 128;
#else
/* Working - let's increase size */
const int sizey = 1;
const int sizex = (8*32);
const int sizez = 1;
/*const int sizey = 1;
const int sizex = 3*(8*32);
const int sizez = 1;*/
#endif
//printf("Memsize required = %i\n", sizex*sizex);
// TODO: perform the bench for all types or for a user specified type
const ggml_type qtype = GGML_TYPE_Q4_1;
size_t ctx_size = 0;
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += 1024*1024*16;
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/* no_alloc =*/ 0
};
ctx = ggml_init(params);
if (!ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return 1;
}
printf("Creating new tensors\n");
// printf("Creating new tensor m1\n");
struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
ggml_set_f32(m11, 1.0f);
// printf("Creating new tensor m1\n");
struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
ggml_set_f32(m12, 1.5f);
// printf("Creating new tensor m2\n");
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
ggml_set_f32(m2, 2.0f);
printf("\n------ Test 1 - Matrix Mult via F32 code\n");
// printf("Creating new tensor m11xm2\n");
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
// printf("Creating compute graph\n");
struct ggml_cgraph * gf = ggml_new_graph(ctx);
ggml_build_forward_expand(gf, m11xm2);
printf("n_threads=%i\n", benchmark_params.n_threads);
TENSOR_DUMP(m11);
TENSOR_DUMP(m2);
std::vector<uint8_t> work_buffer;
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
TENSOR_DUMP(gf->nodes[0]);
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
int32_t nelements = sizex*sizey;
// Set up a the benchmark matrices
// printf("Creating new tensor q11 & Running quantize\n");
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
// Set up a the compute graph
// printf("Creating new tensor q31\n");
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
// printf("Creating compute graph\n");
struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
ggml_build_forward_expand(gf31, q31);
// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr);
// printf("Creating new tensor q32\n");
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
//printf("Creating compute graph\n");
struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
ggml_build_forward_expand(gf32, q32);
printf("n_threads=%i\n", benchmark_params.n_threads);
const int dimx = sizex;
const int dimy = sizey;
const int dimz = sizez;
long long int flops_per_dot_product = dimy + dimy;
long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
// Let's use the F32 result from above as a reference for the quantized multiplication
float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
printf("=====================================================================================\n");
double gflops_sum = 0;
for (int i=0;i<benchmark_params.n_iterations ;i++) {
long long int start = ggml_time_us();
//printf("Running ggml_graph_compute\n");
ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
long long int stop = ggml_time_us();
long long int usec = stop-start;
double gflops = (double)(flops_per_matrix)/usec/1000.0;
gflops_sum += gflops;
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
i,
benchmark_params.n_threads,
sizex, sizey, sizez, flops_per_matrix,
usec,gflops);
#ifdef VERBOSE_DEBUGGING
TENSOR_DUMP("res",gf31.nodes[0])
#endif
// Check that the matrix multiplication result is in the right ballpark
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
if (delta > allowed_delta) {
printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
sum_of_F32_reference,
sum_of_Q4_result,
delta,
allowed_delta
);
exit(0);
}
// Running a different graph computation to make sure we override the CPU cache lines
ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
}
printf("\n");
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
printf("=====================================================================================\n");
}

View file

@ -9,6 +9,7 @@
#include <climits>
#include <cstring>
#include <cstdarg>
#include <cinttypes>
#include <ctime>
#include <random>
#include <stdexcept>
@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
try {
w->token_embedding_table.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->rms_att_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_ffn_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->wq.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wo.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->rms_final_weight.resize(p->dim);
LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) {
w->wcls = {};
} else {
w->wcls.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
}
catch (std::length_error &) {
@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
fseek(f, 0, SEEK_END);
auto end = ftell(f);
if (curr != end) {
LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
return 1;
}
@ -181,26 +182,26 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
}
static void print_sample_weights(TransformerWeights *w){
LOG("----- Quick print of first of the weight vales of all the variables\n");
LOG("%f\n", w->token_embedding_table[0]);
LOG("%f\n", w->rms_att_weight[0]);
LOG("%f\n", w->rms_ffn_weight[0]);
LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
LOG_INF("%f\n", w->token_embedding_table[0]);
LOG_INF("%f\n", w->rms_att_weight[0]);
LOG_INF("%f\n", w->rms_ffn_weight[0]);
LOG("%f\n", w->wq[0]);
LOG("%f\n", w->wk[0]);
LOG("%f\n", w->wv[0]);
LOG("%f\n", w->wo[0]);
LOG("%f\n", w->w1[0]);
LOG("%f\n", w->w2[0]);
LOG("%f\n", w->w3[0]);
LOG("%f\n", w->rms_att_weight[0]);
if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
LOG_INF("%f\n", w->wq[0]);
LOG_INF("%f\n", w->wk[0]);
LOG_INF("%f\n", w->wv[0]);
LOG_INF("%f\n", w->wo[0]);
LOG_INF("%f\n", w->w1[0]);
LOG_INF("%f\n", w->w2[0]);
LOG_INF("%f\n", w->w3[0]);
LOG_INF("%f\n", w->rms_att_weight[0]);
if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
struct llama_vocab {
struct my_llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
@ -318,20 +319,20 @@ struct train_params {
};
static void print_params(struct my_llama_hparams * params) {
LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
LOG("%s: n_embd: %u\n", __func__, params->n_embd);
LOG("%s: n_mult: %u\n", __func__, params->n_mult);
LOG("%s: n_head: %u\n", __func__, params->n_head);
LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
LOG("%s: n_ff: %u\n", __func__, params->n_ff);
LOG("%s: n_layer: %u\n", __func__, params->n_layer);
LOG("%s: n_rot: %u\n", __func__, params->n_rot);
LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab);
LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx);
LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd);
LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult);
LOG_INF("%s: n_head: %u\n", __func__, params->n_head);
LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff);
LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer);
LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot);
}
static void print_tensor_info(const struct ggml_context * ctx) {
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
LOG("%s: Allocating ", __func__);
LOG_INF("%s: Allocating ", __func__);
int64_t total = 1;
int i = 0;
for (; i < ggml_n_dims(t); ++i) {
@ -524,9 +525,9 @@ static std::string llama_escape_whitespaces(const std::string & text) {
return out.str();
}
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) {
if (is_ggml_file(filename)) {
LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
gguf_free(ctx);
} else {
// assume llama2.c vocabulary
LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
@ -582,13 +583,13 @@ static void load_vocab(const char * filename, const Config * config, struct llam
const int n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused
vocab->id_to_token.resize(n_vocab);
for (llama_vocab::id id=0; id<n_vocab; ++id) {
for (my_llama_vocab::id id=0; id<n_vocab; ++id) {
float_t score = file.read_f32();
uint32_t len = file.read_u32();
std::string text = file.read_string(len);
unsigned char byte_val;
llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
my_llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
if (id == UNKNOWN_TOKEN_ID) {
text = "<unk>";
type = LLAMA_TOKEN_TYPE_UNKNOWN;
@ -630,7 +631,7 @@ static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const floa
}
static void save_as_llama_model(
struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
) {
// convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
@ -670,7 +671,7 @@ static void save_as_llama_model(
std::vector<const char*> tokens;
std::vector<float> scores;
std::vector<llama_token_type> token_types;
for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) {
tokens.push_back(token_data.text.c_str());
scores.push_back(token_data.score);
token_types.push_back(token_data.type);
@ -871,23 +872,25 @@ static std::string basename(const std::string &path) {
}
int main(int argc, char ** argv) {
gpt_init();
struct train_params params = get_default_train_params();
if (!params_parse(argc, argv, &params)) {
return 1;
}
log_set_target(stdout);
Config config;
TransformerWeights weights = {};
{
LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
FILE * file = fopen(params.fn_llama2c_model, "rb");
if (!file) {
LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
return 1;
}
// read in the config header
if (fread(&config, sizeof(Config), 1, file) != 1) {
LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
return 1;
}
auto shared_weights = config.vocab_size > 0;
@ -896,13 +899,13 @@ int main(int argc, char ** argv) {
// read in the Transformer weights
alloc_weights(&weights, &config, shared_weights);
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
return 1;
}
fclose(file);
}
struct llama_vocab vocab;
struct my_llama_vocab vocab;
load_vocab(params.fn_vocab_model, &config, &vocab);
struct my_llama_model model;
@ -929,7 +932,7 @@ int main(int argc, char ** argv) {
model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx);
return 0;

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
#include "ggml.h"
@ -12,14 +13,15 @@
#include "ggml-metal.h"
#endif
#include <algorithm>
#include <climits>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <climits>
//////////////////////////////////////////////////
@ -35,9 +37,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
return ret;
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
static void print_usage(int, char ** argv) {
printf("\nexample usage:\n");
printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
@ -271,7 +271,7 @@ struct tokenized_prompt {
size_t max_seq_len;
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@ -390,8 +390,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
return 1;
}
@ -486,7 +485,7 @@ int main(int argc, char ** argv) {
if (use_pca) {
// run PCA
PCA::pca_params pca_params;
pca_params.n_threads = params.n_threads;
pca_params.n_threads = params.cpuparams.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);

View file

@ -12,12 +12,9 @@
#include <cstdio>
#include <ctime>
#include <random>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#define DEBUG_POS 5
@ -207,13 +204,6 @@ static ggml_status compute_piter(
ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
}
// TODO: enable GPU support when support for GGML_OP_SQRT is added
//#ifdef GGML_USE_METAL
// if (ggml_backend_is_metal(model.backend)) {
// ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
// }
//#endif
ggml_status res = ggml_backend_graph_compute(model.backend, gf);
if (res == GGML_STATUS_SUCCESS) {
auto extract_i = [](std::string prefix, std::string str) -> int {
@ -229,8 +219,8 @@ static ggml_status compute_piter(
result.eigenvectors.resize(params.n_batch);
result.distances.resize(params.n_batch);
// get output nodes
for (int i = 0; i < gf->n_nodes; ++i) {
auto node = gf->nodes[i];
for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
auto node = ggml_graph_node(gf, i);
int iter = -1;
// find b_tensor (without copying data from device)
if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {

View file

@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.):
```bash
./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null
```
### Windows:
```powershell
llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null
```
The above command will output space-separated float values.
@ -50,11 +50,11 @@ The above command will output space-separated float values.
### Unix-based systems (Linux, macOS, etc.):
```bash
./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```
### Windows:
```powershell
embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```

View file

@ -1,4 +1,6 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <ctime>
@ -31,13 +33,24 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
}
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
const struct llama_model * model = llama_get_model(ctx);
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
// run model
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
// encoder-only model
if (llama_encode(ctx, batch) < 0) {
LOG_ERR("%s : failed to encode\n", __func__);
}
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
// decoder-only model
if (llama_decode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to decode\n", __func__);
LOG_ERR("%s : failed to decode\n", __func__);
}
}
for (int i = 0; i < batch.n_tokens; i++) {
@ -45,11 +58,22 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
continue;
}
// try to get sequence embeddings - supported only when pooling_type is not NONE
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
const float * embd = nullptr;
int embd_pos = 0;
float * out = output + batch.seq_id[i][0] * n_embd;
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
// try to get token embeddings
embd = llama_get_embeddings_ith(ctx, i);
embd_pos = i;
GGML_ASSERT(embd != NULL && "failed to get token embeddings");
} else {
// try to get sequence embeddings - supported only when pooling_type is not NONE
embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
embd_pos = batch.seq_id[i][0];
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
}
float * out = output + embd_pos * n_embd;
llama_embd_normalize(embd, out, n_embd, embd_norm);
}
}
@ -57,25 +81,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
return 1;
}
gpt_init();
params.embedding = true;
// For non-causal models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
llama_backend_init();
llama_numa_init(params.numa);
@ -85,7 +100,7 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
@ -93,20 +108,21 @@ int main(int argc, char ** argv) {
const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
return 1;
}
if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
// split the prompt into lines
@ -119,9 +135,9 @@ int main(int argc, char ** argv) {
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;
for (const auto & prompt : prompts) {
auto inp = ::llama_tokenize(ctx, prompt, true, false);
auto inp = ::llama_tokenize(ctx, prompt, true, true);
if (inp.size() > n_batch) {
fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
return 1;
}
@ -132,20 +148,20 @@ int main(int argc, char ** argv) {
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_token_sep(model)) {
fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
}
}
// tokenization stats
if (params.verbose_prompt) {
for (int i = 0; i < (int) inputs.size(); i++) {
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
for (int j = 0; j < (int) inputs[i].size(); j++) {
fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
}
fprintf(stderr, "\n\n");
LOG("\n\n");
}
}
@ -153,13 +169,23 @@ int main(int argc, char ** argv) {
const int n_prompts = prompts.size();
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
// count number of embeddings
int n_embd_count = 0;
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
for (int k = 0; k < n_prompts; k++) {
n_embd_count += inputs[k].size();
}
} else {
n_embd_count = n_prompts;
}
// allocate output
const int n_embd = llama_n_embd(model);
std::vector<float> embeddings(n_prompts * n_embd, 0);
std::vector<float> embeddings(n_embd_count * n_embd, 0);
float * emb = embeddings.data();
// break into batches
int p = 0; // number of prompts processed already
int e = 0; // number of embeddings already stored
int s = 0; // number of prompts in current batch
for (int k = 0; k < n_prompts; k++) {
// clamp to n_batch tokens
@ -169,11 +195,11 @@ int main(int argc, char ** argv) {
// encode if at capacity
if (batch.n_tokens + n_toks > n_batch) {
float * out = emb + p * n_embd;
float * out = emb + e * n_embd;
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
llama_batch_clear(batch);
p += s;
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
s = 0;
llama_batch_clear(batch);
}
// add to batch
@ -182,39 +208,67 @@ int main(int argc, char ** argv) {
}
// final batch
float * out = emb + p * n_embd;
float * out = emb + e * n_embd;
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
if (params.embd_out.empty()) {
LOG("\n");
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
for (int j = 0; j < n_embd_count; j++) {
LOG("embedding %d: ", j);
for (int i = 0; i < std::min(3, n_embd); i++) {
if (params.embd_normalize == 0) {
LOG("%6.0f ", emb[j * n_embd + i]);
} else {
LOG("%9.6f ", emb[j * n_embd + i]);
}
}
LOG(" ... ");
for (int i = n_embd - 3; i < n_embd; i++) {
if (params.embd_normalize == 0) {
LOG("%6.0f ", emb[j * n_embd + i]);
} else {
LOG("%9.6f ", emb[j * n_embd + i]);
}
}
LOG("\n");
}
} else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
for (int j = 0; j < n_embd_count; j++) {
// NOTE: if you change this log - update the tests in ci/run.sh
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
}
} else {
// print the first part of the embeddings or for a single prompt, the full embedding
fprintf(stdout, "\n");
for (int j = 0; j < n_prompts; j++) {
fprintf(stdout, "embedding %d: ", j);
LOG("embedding %d: ", j);
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
if (params.embd_normalize == 0) {
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
LOG("%6.0f ", emb[j * n_embd + i]);
} else {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
LOG("%9.6f ", emb[j * n_embd + i]);
}
}
fprintf(stdout, "\n");
LOG("\n");
}
// print cosine similarity matrix
if (n_prompts > 1) {
fprintf(stdout, "\n");
printf("cosine similarity matrix:\n\n");
LOG("\n");
LOG("cosine similarity matrix:\n\n");
for (int i = 0; i < n_prompts; i++) {
fprintf(stdout, "%6.6s ", prompts[i].c_str());
LOG("%6.6s ", prompts[i].c_str());
}
fprintf(stdout, "\n");
LOG("\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f ", sim);
LOG("%6.2f ", sim);
}
LOG("%1.10s", prompts[i].c_str());
LOG("\n");
}
fprintf(stdout, "%1.10s", prompts[i].c_str());
fprintf(stdout, "\n");
}
}
}
@ -222,43 +276,45 @@ int main(int argc, char ** argv) {
if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
const bool notArray = params.embd_out != "array";
fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
for (int j = 0;;) { // at least one iteration (one prompt)
if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
fprintf(stdout, "[");
if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
LOG("[");
for (int i = 0;;) { // at least one iteration (n_embd > 0)
fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
i++;
if (i < n_embd) fprintf(stdout, ","); else break;
if (i < n_embd) LOG(","); else break;
}
fprintf(stdout, notArray ? "]\n }" : "]");
LOG(notArray ? "]\n }" : "]");
j++;
if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
}
fprintf(stdout, notArray ? "\n ]" : "]\n");
LOG(notArray ? "\n ]" : "]\n");
if (params.embd_out == "json+" && n_prompts > 1) {
fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
for (int i = 0;;) { // at least two iteration (n_prompts > 1)
fprintf(stdout, " [");
for (int j = 0;;) { // at least two iteration (n_prompts > 1)
LOG(",\n \"cosineSimilarity\": [\n");
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
LOG(" [");
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f", sim);
LOG("%6.2f", sim);
j++;
if (j < n_prompts) fprintf(stdout, ", "); else break;
if (j < n_embd_count) LOG(", "); else break;
}
fprintf(stdout, " ]");
LOG(" ]");
i++;
if (i < n_prompts) fprintf(stdout, ",\n"); else break;
if (i < n_embd_count) LOG(",\n"); else break;
}
fprintf(stdout, "\n ]");
LOG("\n ]");
}
if (notArray) fprintf(stdout, "\n}\n");
if (notArray) LOG("\n}\n");
}
LOG("\n");
llama_perf_context_print(ctx);
// clean up
llama_print_timings(ctx);
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);

View file

@ -1,11 +1,11 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "ggml.h"
#include <cstdio>
#include <random>
#include <string>
#include <tuple>
#include <vector>
/**
@ -31,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
printf(" [\n");
LOG(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
printf(" ..., \n");
LOG(" ..., \n");
i2 = ne[2] - n;
}
printf(" [\n");
LOG(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) {
printf(" ..., \n");
LOG(" ..., \n");
i1 = ne[1] - n;
}
printf(" [");
LOG(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) {
printf("..., ");
LOG("..., ");
i0 = ne[0] - n;
}
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
@ -64,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
} else {
GGML_ABORT("fatal error");
}
printf("%12.4f", v);
LOG("%12.4f", v);
sum += v;
if (i0 < ne[0] - 1) printf(", ");
if (i0 < ne[0] - 1) LOG(", ");
}
printf("],\n");
LOG("],\n");
}
printf(" ],\n");
LOG(" ],\n");
}
printf(" ]\n");
printf(" sum = %f\n", sum);
LOG(" ]\n");
LOG(" sum = %f\n", sum);
}
}
@ -102,7 +102,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
t->name, ggml_type_name(t->type), ggml_op_desc(t),
src0->name, ggml_ne_string(src0).c_str(),
src1 ? src1_str : "",
@ -127,12 +127,12 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
}
static bool run(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
@ -144,14 +144,11 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
print_build_info();
std::mt19937 rng(params.seed);
gpt_init();
llama_backend_init();
llama_numa_init(params.numa);
@ -168,14 +165,15 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
LOG_ERR("%s : failed to init\n", __func__);
return 1;
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
}
bool OK = run(ctx, params);
@ -183,7 +181,8 @@ int main(int argc, char ** argv) {
return 1;
}
llama_print_timings(ctx);
LOG("\n");
llama_perf_context_print(ctx);
llama_free(ctx);
llama_free_model(model);

View file

@ -17,9 +17,9 @@ For example:
```bash
./bin/llama-export-lora \
-m open-llama-3b-v2-q8_0.gguf \
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
--lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
-m open-llama-3b-v2.gguf \
-o open-llama-3b-v2-english2tokipona-chat.gguf \
--lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf
```
Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "ggml.h"
#include "ggml-alloc.h"
@ -10,6 +11,12 @@
static bool g_verbose = false;
struct tensor_transformation {
struct ggml_tensor * in;
struct ggml_tensor * out;
bool is_copy;
};
static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
@ -50,20 +57,6 @@ static struct gguf_context * load_gguf(std::string & fname, struct ggml_context
return ctx_gguf;
}
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result;
for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos);
if (new_pos == std::string::npos) {
result += s.substr(pos, s.size() - pos);
break;
}
result += s.substr(pos, new_pos - pos) + replace;
pos = new_pos;
}
s = std::move(result);
}
struct file_input {
struct ggml_context * ctx_meta = nullptr;
struct gguf_context * ctx_gguf = nullptr;
@ -212,8 +205,7 @@ struct lora_merge_ctx {
}
// mapping base tensor to out tensor (same shape with base, but different type)
// if out_tensor == nullptr, we only copy it
std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
std::vector<tensor_transformation> trans;
for (auto & it : base_model.tensors) {
bool t_a = true;
bool t_b = true;
@ -226,14 +218,22 @@ struct lora_merge_ctx {
// only copy
struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
ggml_set_name(cpy_tensor, base_tensor->name);
base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
trans.push_back({
cpy_tensor,
cpy_tensor,
true,
});
gguf_add_tensor(ctx_out, cpy_tensor);
} else if (t_a && t_b) {
// need merging
struct ggml_tensor * out_tensor = ggml_new_tensor(
ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
ggml_set_name(out_tensor, base_tensor->name);
base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
trans.push_back({
base_tensor,
out_tensor,
false,
});
gguf_add_tensor(ctx_out, out_tensor);
} else {
throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
@ -248,12 +248,12 @@ struct lora_merge_ctx {
// process base model tensors
size_t n_merged = 0;
for (auto & it : base_to_out_tensors) {
if (it.second != nullptr) {
merge_tensor(it.first, it.second);
for (auto & it : trans) {
if (!it.is_copy) {
merge_tensor(it.in, it.out);
n_merged++;
} else {
copy_tensor(it.first);
copy_tensor(it.in);
}
}
@ -266,7 +266,7 @@ struct lora_merge_ctx {
}
printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
}
void copy_tensor(struct ggml_tensor * base) {
@ -299,6 +299,10 @@ struct lora_merge_ctx {
for (size_t i = 0; i < adapters.size(); ++i) {
auto t_a = adapters[i]->get_tensor(name_lora_a);
auto t_b = adapters[i]->get_tensor(name_lora_b);
// TODO: add support for quantized lora
if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) {
throw std::runtime_error("quantized LoRA adapters is not supported, please retry with f16 or f32");
}
inp_a[i] = ggml_dup_tensor(ctx, t_a);
inp_b[i] = ggml_dup_tensor(ctx, t_b);
}
@ -310,9 +314,9 @@ struct lora_merge_ctx {
// optionally dequantize it
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
auto nels = ggml_nelements(inp_base);
ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
const auto * qtype = ggml_get_type_traits(base->type);
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
} else {
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
@ -366,7 +370,7 @@ struct lora_merge_ctx {
// write data to output file
{
auto result = gf->nodes[gf->n_nodes - 1];
auto * result = ggml_graph_node(gf, -1);
size_t len = ggml_nbytes(result);
if (read_buf.size() < len) {
read_buf.resize(len);
@ -388,9 +392,7 @@ struct lora_merge_ctx {
}
};
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
static void print_usage(int, char ** argv) {
printf("\nexample usage:\n");
printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
printf("\nNOTE: output model is F16\n");
@ -400,14 +402,13 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
return 1;
}
g_verbose = (params.verbosity == 1);
g_verbose = (params.verbosity > 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());

View file

@ -1,9 +1,5 @@
#define LLAMA_API_INTERNAL
#include "grammar-parser.h"
#include "ggml.h"
#include "llama.h"
#include "unicode.h"
#include "llama-grammar.h"
#include <cstdio>
#include <cstdlib>
@ -12,29 +8,28 @@
#include <string>
#include <vector>
static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
auto decoded = decode_utf8(input_str, {});
const auto & code_points = decoded.first;
static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
const auto cpts = unicode_cpts_from_utf8(input_str);
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
for (const auto & cpt : cpts) {
const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
if (cur_stacks.empty()) {
if (stacks_cur.empty()) {
error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
cur_stacks = prev_stacks;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
stacks_cur = stacks_prev;
return false;
}
++pos;
}
for (const auto & stack : cur_stacks) {
for (const auto & stack : stacks_cur) {
if (stack.empty()) {
return true;
}
@ -85,27 +80,7 @@ int main(int argc, char** argv) {
grammar_str = buffer.str();
}
// Parse the GBNF grammar
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
fprintf(stdout, "%s: failed to parse grammar\n", __func__);
return 1;
}
// Ensure that there is a "root" node.
if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
return 1;
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
// Create the LLAMA grammar
auto grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar");
}
@ -122,7 +97,7 @@ int main(int argc, char** argv) {
// Validate the input string against the grammar
size_t error_pos;
std::string error_msg;
bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);
if (is_valid) {
fprintf(stdout, "Input string is valid according to the grammar.\n");
@ -131,7 +106,7 @@ int main(int argc, char** argv) {
}
// Clean up
llama_grammar_free(grammar);
llama_grammar_free_impl(grammar);
return 0;
}

View file

@ -0,0 +1,5 @@
set(TARGET llama-gen-docs)
add_executable(${TARGET} gen-docs.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -0,0 +1,83 @@
#include "arg.h"
#include "common.h"
#include <fstream>
#include <string>
// Export usage message (-h) to markdown format
static void write_table_header(std::ofstream & file) {
file << "| Argument | Explanation |\n";
file << "| -------- | ----------- |\n";
}
static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
file << "| `";
// args
for (const auto & arg : opt.args) {
if (arg == opt.args.front()) {
file << arg;
if (opt.args.size() > 1) file << ", ";
} else {
file << arg << (arg != opt.args.back() ? ", " : "");
}
}
// value hint
if (opt.value_hint) {
std::string md_value_hint(opt.value_hint);
string_replace_all(md_value_hint, "|", "\\|");
file << " " << md_value_hint;
}
if (opt.value_hint_2) {
std::string md_value_hint_2(opt.value_hint_2);
string_replace_all(md_value_hint_2, "|", "\\|");
file << " " << md_value_hint_2;
}
// help text
std::string md_help(opt.help);
string_replace_all(md_help, "\n", "<br/>");
string_replace_all(md_help, "|", "\\|");
file << "` | " << md_help << " |\n";
}
static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
write_table_header(file);
for (const auto & opt : opts) {
write_table_entry(file, *opt);
}
}
static void export_md(std::string fname, llama_example ex) {
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
gpt_params params;
auto ctx_arg = gpt_params_parser_init(params, ex);
std::vector<llama_arg *> common_options;
std::vector<llama_arg *> sparam_options;
std::vector<llama_arg *> specific_options;
for (auto & opt : ctx_arg.options) {
// in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
if (opt.is_sparam) {
sparam_options.push_back(&opt);
} else if (opt.in_example(ctx_arg.ex)) {
specific_options.push_back(&opt);
} else {
common_options.push_back(&opt);
}
}
file << "**Common params**\n\n";
write_table(file, common_options);
file << "\n\n**Sampling params**\n\n";
write_table(file, sparam_options);
file << "\n\n**Example-specific params**\n\n";
write_table(file, specific_options);
}
int main(int, char **) {
export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
return 0;
}

View file

@ -22,12 +22,20 @@
#endif
enum split_operation : uint8_t {
SPLIT_OP_SPLIT,
SPLIT_OP_MERGE,
OP_NONE,
OP_SPLIT,
OP_MERGE,
};
enum split_mode : uint8_t {
MODE_NONE,
MODE_TENSOR,
MODE_SIZE,
};
struct split_params {
split_operation operation = SPLIT_OP_SPLIT;
split_operation operation = OP_NONE;
split_mode mode = MODE_NONE;
size_t n_bytes_split = 0;
int n_split_tensors = 128;
std::string input;
@ -87,59 +95,52 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
}
bool arg_found = false;
bool is_op_set = false;
bool is_mode_set = false;
if (arg == "-h" || arg == "--help") {
split_print_usage(argv[0]);
exit(0);
}
if (arg == "--version") {
} else if (arg == "--version") {
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
exit(0);
}
if (arg == "--dry-run") {
} else if (arg == "--dry-run") {
arg_found = true;
params.dry_run = true;
}
if (arg == "--no-tensor-first-split") {
} else if (arg == "--no-tensor-first-split") {
arg_found = true;
params.no_tensor_first_split = true;
}
if (is_op_set) {
} else if (arg == "--merge") {
arg_found = true;
if (params.operation != OP_NONE && params.operation != OP_MERGE) {
throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
}
if (arg == "--merge") {
params.operation = OP_MERGE;
} else if (arg == "--split") {
arg_found = true;
is_op_set = true;
params.operation = SPLIT_OP_MERGE;
if (params.operation != OP_NONE && params.operation != OP_SPLIT) {
throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
}
params.operation = OP_SPLIT;
} else if (arg == "--split-max-tensors") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
if (arg == "--split") {
arg_found = true;
is_op_set = true;
params.operation = SPLIT_OP_SPLIT;
}
if (is_mode_set) {
if (params.mode != MODE_NONE && params.mode != MODE_TENSOR) {
throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
}
if (arg == "--split-max-tensors") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
arg_found = true;
is_mode_set = true;
params.mode = MODE_TENSOR;
params.n_split_tensors = atoi(argv[arg_idx]);
}
if (arg == "--split-max-size") {
} else if (arg == "--split-max-size") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
arg_found = true;
is_mode_set = true;
if (params.mode != MODE_NONE && params.mode != MODE_SIZE) {
throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
}
params.mode = MODE_SIZE;
params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]);
}
@ -148,11 +149,20 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
}
}
// the operation is split if not specified
if (params.operation == OP_NONE) {
params.operation = OP_SPLIT;
}
// the split mode is by tensor if not specified
if (params.mode == MODE_NONE) {
params.mode = MODE_TENSOR;
}
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
}
if (argc - arg_idx < 2) {
if (argc - arg_idx != 2) {
throw std::invalid_argument("error: bad arguments");
}
@ -265,13 +275,15 @@ struct split_strategy {
}
bool should_split(int i_tensor, size_t next_size) {
if (params.n_bytes_split > 0) {
if (params.mode == MODE_SIZE) {
// split by max size per file
return next_size > params.n_bytes_split;
} else {
} else if (params.mode == MODE_TENSOR) {
// split by number of tensors per file
return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
}
// should never happen
GGML_ABORT("invalid mode");
}
void print_info() {
@ -389,10 +401,17 @@ static void gguf_merge(const split_params & split_params) {
int n_split = 1;
int total_tensors = 0;
auto * ctx_out = gguf_init_empty();
// avoid overwriting existing output file
if (std::ifstream(split_params.output.c_str())) {
fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
exit(EXIT_FAILURE);
}
std::ofstream fout(split_params.output.c_str(), std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
auto * ctx_out = gguf_init_empty();
std::vector<uint8_t> read_data;
std::vector<ggml_context *> ctx_metas;
std::vector<gguf_context *> ctx_ggufs;
@ -552,9 +571,9 @@ int main(int argc, const char ** argv) {
split_params_parse(argc, argv, params);
switch (params.operation) {
case SPLIT_OP_SPLIT: gguf_split(params);
case OP_SPLIT: gguf_split(params);
break;
case SPLIT_OP_MERGE: gguf_merge(params);
case OP_MERGE: gguf_merge(params);
break;
default: split_print_usage(argv[0]);
exit(EXIT_FAILURE);

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
@ -9,7 +10,7 @@
static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
std::vector<std::vector<float>> result;
const llama_model * mdl = llama_get_model(ctx);
const llama_model * model = llama_get_model(ctx);
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
@ -18,16 +19,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
const std::string input_string = instruction + sentences[i];
std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false);
const int32_t n_toks = inputs.size();
// GritLM seems to have EOS = ""
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
// inputs.push_back(llama_token_eos(mdl));
// inputs.push_back(llama_token_eos(model));
// we want to ignore instruction tokens for mean pooling
const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
const int32_t n_inst = llama_tokenize(model, instruction, true, false).size();
#ifdef GRIT_DEBUG
// debug tokens - should be matching as referenced in the GritLM sample
@ -51,7 +52,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
llama_decode(ctx, batch);
// get embedding dimensions
uint64_t n_embd = llama_n_embd(mdl);
uint64_t n_embd = llama_n_embd(model);
// allocate embedding output
std::vector<float> emb_unorm(n_embd, 0.0f);
@ -92,11 +93,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
return result;
}
static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) {
std::string result;
const llama_model * mdl = llama_get_model(ctx);
llama_token eos_token = llama_token_eos(mdl);
const llama_model * model = llama_get_model(ctx);
llama_token eos_token = llama_token_eos(model);
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, false);
@ -104,28 +105,24 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true);
int32_t i_current_token = 0;
while (true) {
llama_batch_clear(bat);
auto n_inputs = (int32_t)inputs.size();
{
const int32_t n_inputs = inputs.size();
for (int32_t i = 0; i < n_inputs; i++) {
llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
}
}
inputs.clear();
llama_decode(ctx, bat);
auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
auto n_candidates = (int32_t)candidates.size();
for (int32_t token = 0; token < n_candidates; token++) {
candidates[token] = llama_token_data{ token, logits[token], 0.0f };
}
auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
if (token == eos_token) {
break;
}
@ -157,20 +154,29 @@ static std::string gritlm_instruction(const std::string & instruction) {
int main(int argc, char * argv[]) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
gpt_init();
llama_model_params mparams = llama_model_params_from_gpt_params(params);
llama_context_params cparams = llama_context_params_from_gpt_params(params);
llama_backend_init();
llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
// create generation context
llama_context * ctx = llama_new_context_with_model(mdl, cparams);
llama_context * ctx = llama_new_context_with_model(model, cparams);
auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false;
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
// ### Embedding/Representation ###
// samples taken from: https://github.com/ContextualAI/gritlm#basic
@ -191,7 +197,7 @@ int main(int argc, char * argv[]) {
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
const int n_embd = llama_n_embd(mdl);
const int n_embd = llama_n_embd(model);
const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
@ -208,11 +214,12 @@ int main(int argc, char * argv[]) {
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
{
const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
std::string response = generate(ctx, prompt, true);
std::string response = generate(ctx, smpl, prompt, true);
}
llama_sampler_free(smpl);
llama_free(ctx);
llama_free_model(mdl);
llama_free_model(model);
llama_backend_free();
return 0;

View file

@ -1,4 +1,6 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <cmath>
@ -17,15 +19,13 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n");
LOG("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
LOG_TEE("\n");
LOG("\n");
}
struct Stats {
@ -126,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
e.counts.resize(src1->ne[0]*n_as, 0);
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ABORT("fatal error");
}
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
}
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
// loop over all possible experts, regardless if they are used or not in the batch
for (int ex = 0; ex < n_as; ++ex) {
size_t e_start = ex*src1->ne[0];
@ -152,7 +150,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
if (!std::isfinite(e.values[e_start + j])) {
fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
LOG("\n");
LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
exit(1);
}
}
@ -175,20 +174,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
e.counts.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ABORT("fatal error");
}
++e.ncall;
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
}
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
if (!std::isfinite(e.values[j])) {
fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
exit(1);
}
}
@ -240,17 +237,17 @@ void IMatrixCollector::save_imatrix(int ncall) const {
}
if (n_zeros != 0 && is_first) {
fprintf(stderr, "\n");
LOG_INF("\n");
is_first = false;
}
if (n_zeros == n_all) {
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
continue;
}
if (n_zeros > 0) {
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
continue;
}
@ -259,7 +256,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
}
if (to_store.size() < m_stats.size()) {
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
}
std::ofstream out(fname, std::ios::binary);
@ -291,21 +288,20 @@ void IMatrixCollector::save_imatrix(int ncall) const {
out.write(m_params.prompt_file.c_str(), len);
}
if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}
LOGV(1, "\n");
LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}
bool IMatrixCollector::load_imatrix(const char * fname) {
std::ifstream in(fname, std::ios::binary);
if (!in) {
printf("%s: failed to open %s\n",__func__, fname);
LOG_ERR("%s: failed to open %s\n",__func__, fname);
return false;
}
int n_entries;
in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, fname);
LOG_ERR("%s: no data in file %s\n", __func__, fname);
return false;
}
for (int i = 0; i < n_entries; ++i) {
@ -313,7 +309,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false;
}
name_as_vec[len] = 0;
@ -324,7 +320,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n",__func__,i);
LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
m_stats = {};
return false;
}
@ -337,7 +333,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
std::vector<float> tmp(nval);
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i);
LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
m_stats = {};
return false;
}
@ -433,31 +429,30 @@ static void process_logits(
}
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (params.i_chunk > 0) {
if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
return false;
}
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
}
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
return false;
}
@ -479,7 +474,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
double nll = 0.0;
double nll2 = 0.0;
fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@ -515,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
// TODO: use batch.logits to save computations instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
@ -532,29 +527,29 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
LOG("%.2f minutes\n", total_seconds / 60.0);
}
if (params.compute_ppl) {
const int first = n_ctx/2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
logits.clear();
}
}
printf("\n");
LOG("\n");
if (params.compute_ppl) {
nll2 /= count;
@ -563,9 +558,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
LOG("Unexpected negative standard deviation of log(prob)\n");
}
}
@ -577,27 +572,28 @@ int main(int argc, char ** argv) {
params.n_ctx = 512;
params.logits_all = true;
params.verbosity = 1;
params.escape = false;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
return 1;
}
gpt_init();
params.n_batch = std::min(params.n_batch, params.n_ctx);
g_collector.set_params(params);
for (const auto & in_file : params.in_files) {
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
if (!g_collector.load_imatrix(in_file.c_str())) {
fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
return 1;
}
}
if (params.in_files.size() > 1) {
printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
g_collector.save_imatrix();
}
@ -616,20 +612,20 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
LOG_ERR("%s : failed to init\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
if (!compute_imatrix(ctx, params)) {
@ -638,7 +634,8 @@ int main(int argc, char ** argv) {
g_collector.save_imatrix();
llama_print_timings(ctx);
LOG("\n");
llama_perf_context_print(ctx);
llama_free(ctx);
llama_free_model(model);

View file

@ -1,8 +1,9 @@
#include "arg.h"
#include "common.h"
#include "console.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include "grammar-parser.h"
#include <cassert>
#include <cinttypes>
@ -34,6 +35,7 @@
static llama_context ** g_ctx;
static llama_model ** g_model;
static gpt_sampler ** g_smpl;
static gpt_params * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss;
@ -54,7 +56,7 @@ static void write_logfile(
const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
return;
}
@ -63,7 +65,7 @@ static void write_logfile(
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
@ -81,7 +83,7 @@ static void write_logfile(
yaml_dump_string_multiline(logfile, "output", output.c_str());
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx);
llama_perf_dump_yaml(logfile, ctx);
fclose(logfile);
}
@ -92,9 +94,14 @@ static void sigint_handler(int signo) {
is_interacting = true;
} else {
console::cleanup();
printf("\n");
llama_print_timings(*g_ctx);
LOG("\n");
gpt_perf_print(*g_ctx, *g_smpl);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
// make sure all logs are flushed
LOG("Interrupted by user\n");
gpt_log_pause(gpt_log_main());
_exit(130);
}
}
@ -103,109 +110,95 @@ static void sigint_handler(int signo) {
int main(int argc, char ** argv) {
gpt_params params;
llama_sampling_params & sparams = params.sparams;
g_params = &params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
return 1;
}
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("infill", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
gpt_init();
auto & sparams = params.sparams;
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
if (params.logits_all) {
printf("\n************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
printf("************\n\n");
LOG_ERR("\n************\n");
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.embedding) {
printf("\n************\n");
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
printf("************\n\n");
LOG_ERR("\n************\n");
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
printf("\n************\n");
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
printf("************\n\n");
LOG_ERR("\n************\n");
LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 0.0) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
LOG("%s: llama backend init\n", __func__);
LOG_INF("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model;
llama_context * ctx;
llama_model * model = nullptr;
llama_context * ctx = nullptr;
gpt_sampler * smpl = nullptr;
g_model = &model;
g_ctx = &ctx;
g_smpl = &smpl;
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model;
ctx = llama_init.context;
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
LOG_DBG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
}
// print system information
{
LOG_TEE("\n");
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
const bool add_bos = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1);
LOG("add_bos: %d\n", add_bos);
const bool add_bos = llama_add_bos_token(model);
GGML_ASSERT(!llama_add_eos_token(model));
std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end;
@ -230,18 +223,19 @@ int main(int argc, char ** argv) {
embd_inp.push_back(middle_token);
}
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
LOG_DBG("add_bos: %d\n", add_bos);
LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
}
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
@ -250,9 +244,8 @@ int main(int argc, char ** argv) {
params.n_keep = (int)embd_inp.size();
}
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
// enable interactive mode if interactive start is specified
if (params.interactive_first) {
@ -260,21 +253,21 @@ int main(int argc, char ** argv) {
}
if (params.verbose_prompt) {
LOG_TEE("\n");
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
LOG_INF("\n");
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (params.n_keep > 0) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
LOG_TEE("'\n");
LOG_CNT("'\n");
}
LOG_TEE("\n");
LOG_INF("\n");
}
if (params.interactive) {
@ -291,30 +284,30 @@ int main(int argc, char ** argv) {
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
LOG_TEE("%s: interactive mode on.\n", __func__);
LOG_INF("%s: interactive mode on.\n", __func__);
if (params.input_prefix_bos) {
LOG_TEE("Input prefix with BOS\n");
LOG_INF("Input prefix with BOS\n");
}
if (!params.input_prefix.empty()) {
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
}
if (!params.input_suffix.empty()) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
}
}
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
smpl = gpt_sampler_init(model, sparams);
LOG_TEE("\n##### Infill mode #####\n\n");
if (params.infill) {
printf("\n************\n");
printf("no need to specify '--infill', always running infill\n");
printf("************\n\n");
}
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_INF("\n");
LOG_INF("\n##### Infill mode #####\n\n");
if (params.interactive) {
const char *control_message;
if (params.multiline_input) {
@ -325,11 +318,11 @@ int main(int argc, char ** argv) {
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
LOG_TEE("== Running in interactive mode. ==\n");
LOG_INF("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
#endif
LOG_TEE( "%s\n", control_message);
LOG_INF( "%s\n", control_message);
is_interacting = params.interactive_first;
}
@ -349,8 +342,6 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
while (n_remain != 0 || params.interactive) {
// predict
if (!embd.empty()) {
@ -364,9 +355,8 @@ int main(int argc, char ** argv) {
embd.resize(max_embd_size);
console::set_display(console::error);
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset);
fflush(stdout);
}
// infinite text generation via context swapping
@ -375,14 +365,14 @@ int main(int argc, char ** argv) {
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() > n_ctx) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
@ -390,9 +380,9 @@ int main(int argc, char ** argv) {
n_past -= n_discard;
LOG("after swap: n_past = %d\n", n_past);
LOG_DBG("after swap: n_past = %d\n", n_past);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
}
@ -404,16 +394,16 @@ int main(int argc, char ** argv) {
n_eval = params.n_batch;
}
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
LOG("n_past = %d\n", n_past);
LOG_DBG("n_past = %d\n", n_past);
}
}
@ -421,11 +411,11 @@ int main(int argc, char ** argv) {
embd.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
llama_sampling_accept(ctx_sampling, ctx, id, true);
gpt_sampler_accept(smpl, id, true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
embd.push_back(id);
@ -435,16 +425,16 @@ int main(int argc, char ** argv) {
// decrement remaining sampling budget
--n_remain;
LOG("n_remain: %d\n", n_remain);
LOG_DBG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
gpt_sampler_accept(smpl, embd_inp[n_consumed], false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
@ -457,7 +447,7 @@ int main(int argc, char ** argv) {
if (input_echo) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str());
LOG("%s", token_str.c_str());
if (embd.size() > 1) {
input_tokens.push_back(id);
@ -466,7 +456,6 @@ int main(int argc, char ** argv) {
output_ss << token_str;
}
}
fflush(stdout);
}
// reset color to default if we there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
@ -476,13 +465,12 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
// deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) {
// print an eot token
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
fflush(stdout);
printf("\n");
LOG("\n");
console::set_display(console::user_input);
std::string buffer;
std::string line;
@ -538,35 +526,33 @@ int main(int argc, char ** argv) {
n_remain = params.n_predict;
n_past = 0;
n_consumed = 0;
// LOG_TEE("took new input\n");
is_interacting = false;
}
// deal with end of generation tokens in interactive mode
else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
LOG("found EOS token\n");
else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
LOG_DBG("found EOS token\n");
if (params.interactive) {
is_interacting = true;
printf("\n");
LOG("\n");
console::set_display(console::user_input);
fflush(stdout);
}
}
if (n_past > 0 && is_interacting && !params.interactive) {
LOG("waiting for user input\n");
LOG_DBG("waiting for user input\n");
if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n");
LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
if (!params.input_prefix.empty()) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
buffer += params.input_prefix;
printf("%s", buffer.c_str());
LOG("%s", buffer.c_str());
}
std::string line;
@ -584,17 +570,17 @@ int main(int argc, char ** argv) {
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty()) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
buffer += params.input_suffix;
printf("%s", params.input_suffix.c_str());
LOG("%s", params.input_suffix.c_str());
}
LOG("buffer: '%s'\n", buffer.c_str());
LOG_DBG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size();
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
@ -605,9 +591,9 @@ int main(int argc, char ** argv) {
}
n_remain -= line_inp.size();
LOG("n_remain: %d\n", n_remain);
LOG_DBG("n_remain: %d\n", n_remain);
} else {
LOG("empty line, passing control back\n");
LOG_DBG("empty line, passing control back\n");
}
input_echo = false; // do not echo this again
@ -615,7 +601,7 @@ int main(int argc, char ** argv) {
if (n_past > 0) {
if (is_interacting) {
llama_sampling_reset(ctx_sampling);
gpt_sampler_reset(smpl);
}
is_interacting = false;
}
@ -634,22 +620,18 @@ int main(int argc, char ** argv) {
}
}
if (!params.interactive && n_remain <= 0) {
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
fflush(stdout);
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
llama_print_timings(ctx);
LOG("\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
llama_free(ctx);
llama_free_model(model);
llama_sampling_free(ctx_sampling);
gpt_sampler_free(smpl);
llama_backend_free();
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS
return 0;
}

View file

@ -14,7 +14,8 @@ Performance testing tool for llama.cpp.
1. [Markdown](#markdown)
2. [CSV](#csv)
3. [JSON](#json)
4. [SQL](#sql)
4. [JSONL](#jsonl)
5. [SQL](#sql)
## Syntax
@ -26,13 +27,17 @@ options:
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128)
-pg <pp,tg> (default: 512,128)
-pg <pp,tg> (default: )
-b, --batch-size <n> (default: 2048)
-ub, --ubatch-size <n> (default: 512)
-ctk, --cache-type-k <t> (default: f16)
-ctv, --cache-type-v <t> (default: f16)
-t, --threads <n> (default: 16)
-t, --threads <n> (default: 8)
-C, --cpu-mask <hex,hex> (default: 0x0)
--cpu-strict <0|1> (default: 0)
--poll <0...100> (default: 50)
-ngl, --n-gpu-layers <n> (default: 99)
-rpc, --rpc <rpc_servers> (default: )
-sm, --split-mode <none|layer|row> (default: layer)
-mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0)
@ -42,7 +47,10 @@ options:
-embd, --embeddings <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md)
--prio <0|1|2|3> (default: 0)
--delay <0...N> (seconds) (default: 0)
-o, --output <csv|json|jsonl|md|sql> (default: md)
-oe, --output-err <csv|json|jsonl|md|sql> (default: none)
-v, --verbose (default: 0)
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
@ -238,6 +246,19 @@ $ ./llama-bench -o json
]
```
### JSONL
```sh
$ ./llama-bench -o jsonl
```
```json lines
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
```
### SQL
SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.

View file

@ -16,6 +16,7 @@
#include <sstream>
#include <string>
#include <vector>
#include <thread>
#include "ggml.h"
#include "llama.h"
@ -123,6 +124,9 @@ static std::string get_cpu_info() {
(LPBYTE)cpu_brand,
&cpu_brand_size) == ERROR_SUCCESS) {
id.assign(cpu_brand, cpu_brand_size);
if (id.find('\0') != std::string::npos) {
id.resize(id.find('\0'));
}
}
RegCloseKey(hKey);
#endif
@ -170,13 +174,14 @@ static std::string get_gpu_info() {
}
// command line params
enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
static const char * output_format_str(output_formats format) {
switch (format) {
case NONE: return "none";
case CSV: return "csv";
case JSON: return "json";
case JSONL: return "jsonl";
case MARKDOWN: return "md";
case SQL: return "sql";
default: GGML_ABORT("invalid output format");
@ -190,6 +195,8 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
format = CSV;
} else if (s == "json") {
format = JSON;
} else if (s == "jsonl") {
format = JSONL;
} else if (s == "md") {
format = MARKDOWN;
} else if (s == "sql") {
@ -225,6 +232,9 @@ struct cmd_params {
std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
std::vector<std::string> cpu_mask;
std::vector<bool> cpu_strict;
std::vector<int> poll;
std::vector<int> n_gpu_layers;
std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode;
@ -236,7 +246,10 @@ struct cmd_params {
std::vector<bool> embeddings;
ggml_numa_strategy numa;
int reps;
ggml_sched_priority prio;
int delay;
bool verbose;
bool progress;
output_formats output_format;
output_formats output_format_stderr;
};
@ -251,6 +264,9 @@ static const cmd_params cmd_params_defaults = {
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()},
/* cpu_mask */ {"0x0"},
/* cpu_strict */ {false},
/* poll */ {50},
/* n_gpu_layers */ {99},
/* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
@ -262,7 +278,10 @@ static const cmd_params cmd_params_defaults = {
/* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
/* prio */ GGML_SCHED_PRIO_NORMAL,
/* delay */ 0,
/* verbose */ false,
/* progress */ false,
/* output_format */ MARKDOWN,
/* output_format_stderr */ NONE,
};
@ -281,8 +300,13 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
#ifdef GGML_USE_RPC
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
#endif
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@ -292,9 +316,12 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
}
@ -338,6 +365,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps;
params.numa = cmd_params_defaults.numa;
params.prio = cmd_params_defaults.prio;
params.delay = cmd_params_defaults.delay;
params.progress = cmd_params_defaults.progress;
for (int i = 1; i < argc; i++) {
arg = argv[i];
@ -409,6 +439,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
types.push_back(gt);
}
if (invalid_param) {
break;
}
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
} else if (arg == "-ctv" || arg == "--cache-type-v") {
if (++i >= argc) {
@ -425,6 +458,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
types.push_back(gt);
}
if (invalid_param) {
break;
}
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
@ -433,6 +469,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<int>(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-C" || arg == "--cpu-mask") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
} else if (arg == "--cpu-strict") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
} else if (arg == "--poll") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.poll.insert(params.poll.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
@ -440,12 +497,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
#ifdef GGML_USE_RPC
} else if (arg == "-rpc" || arg == "--rpc") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rpc_servers.push_back(argv[i]);
#endif
} else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) {
invalid_param = true;
@ -467,6 +526,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
modes.push_back(mode);
}
if (invalid_param) {
break;
}
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
} else if (arg == "-mg" || arg == "--main-gpu") {
if (++i >= argc) {
@ -541,6 +603,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
params.reps = std::stoi(argv[i]);
} else if (arg == "--prio") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
} else if (arg == "--delay") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.delay = std::stoi(argv[i]);
} else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
invalid_param = true;
@ -555,6 +629,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else if (arg == "--progress") {
params.progress = true;
} else {
invalid_param = true;
break;
@ -585,6 +661,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
return params;
}
@ -598,6 +677,9 @@ struct cmd_params_instance {
ggml_type type_k;
ggml_type type_v;
int n_threads;
std::string cpu_mask;
bool cpu_strict;
int poll;
int n_gpu_layers;
std::string rpc_servers;
llama_split_mode split_mode;
@ -667,7 +749,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & tv : params.type_v)
for (const auto & nkvo : params.no_kv_offload)
for (const auto & fa : params.flash_attn)
for (const auto & nt : params.n_threads) {
for (const auto & nt : params.n_threads)
for (const auto & cm : params.cpu_mask)
for (const auto & cs : params.cpu_strict)
for (const auto & pl : params.poll) {
for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) {
continue;
@ -681,6 +766,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
@ -707,6 +795,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
@ -733,6 +824,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
@ -769,6 +863,9 @@ struct test {
int n_batch;
int n_ubatch;
int n_threads;
std::string cpu_mask;
bool cpu_strict;
int poll;
bool has_rpc;
ggml_type type_k;
ggml_type type_v;
@ -795,6 +892,9 @@ struct test {
n_batch = inst.n_batch;
n_ubatch = inst.n_ubatch;
n_threads = inst.n_threads;
cpu_mask = inst.cpu_mask;
cpu_strict = inst.cpu_strict;
poll = inst.poll;
has_rpc = !inst.rpc_servers.empty();
type_k = inst.type_k;
type_v = inst.type_v;
@ -872,13 +972,14 @@ struct test {
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch",
"n_threads", "type_k", "type_v",
"n_threads", "cpu_mask", "cpu_strict", "poll",
"type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn",
"tensor_split", "use_mmap", "embeddings",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts"
"avg_ts", "stddev_ts",
};
return fields;
}
@ -887,7 +988,7 @@ struct test {
static field_type get_field_type(const std::string & field) {
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
field == "n_threads" ||
field == "n_threads" || field == "poll" ||
field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" ||
@ -896,6 +997,7 @@ struct test {
}
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "cpu_strict" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL;
}
@ -928,7 +1030,8 @@ struct test {
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch),
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@ -996,10 +1099,8 @@ struct csv_printer : public printer {
}
};
struct json_printer : public printer {
bool first = true;
static std::string escape_json(const std::string & value) {
static std::string escape_json(const std::string & value) {
std::string escaped;
for (auto c : value) {
if (c == '"') {
@ -1015,9 +1116,9 @@ struct json_printer : public printer {
}
}
return escaped;
}
}
static std::string format_value(const std::string & field, const std::string & value) {
static std::string format_json_value(const std::string & field, const std::string & value) {
switch (test::get_field_type(field)) {
case test::STRING:
return "\"" + escape_json(value) + "\"";
@ -1026,7 +1127,10 @@ struct json_printer : public printer {
default:
return value;
}
}
}
struct json_printer : public printer {
bool first = true;
void print_header(const cmd_params & params) override {
fprintf(fout, "[\n");
@ -1036,7 +1140,7 @@ struct json_printer : public printer {
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
assert(fields.size() == values.size());
for (size_t i = 0; i < fields.size(); i++) {
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
}
}
@ -1059,6 +1163,25 @@ struct json_printer : public printer {
}
};
struct jsonl_printer : public printer {
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
assert(fields.size() == values.size());
for (size_t i = 0; i < fields.size(); i++) {
fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
}
}
void print_test(const test & t) override {
fprintf(fout, "{");
print_fields(test::get_fields(), t.get_values());
fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
fprintf(fout, "}\n");
fflush(fout);
}
};
struct markdown_printer : public printer {
std::vector<std::string> fields;
@ -1067,7 +1190,7 @@ struct markdown_printer : public printer {
return -30;
}
if (field == "t/s") {
return 16;
return 20;
}
if (field == "size" || field == "params") {
return 10;
@ -1149,6 +1272,15 @@ struct markdown_printer : public printer {
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
fields.emplace_back("n_threads");
}
if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
fields.emplace_back("cpu_mask");
}
if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
fields.emplace_back("cpu_strict");
}
if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
fields.emplace_back("poll");
}
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.emplace_back("n_batch");
}
@ -1350,6 +1482,8 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
return std::unique_ptr<printer>(new csv_printer());
case JSON:
return std::unique_ptr<printer>(new json_printer());
case JSONL:
return std::unique_ptr<printer>(new jsonl_printer());
case MARKDOWN:
return std::unique_ptr<printer>(new markdown_printer());
case SQL:
@ -1383,6 +1517,8 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
set_process_priority(params.prio);
// initialize printer
std::unique_ptr<printer> p = create_printer(params.output_format);
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
@ -1402,7 +1538,13 @@ int main(int argc, char ** argv) {
llama_model * lmodel = nullptr;
const cmd_params_instance * prev_inst = nullptr;
int params_idx = 0;
auto params_count = params_instances.size();
for (const auto & inst : params_instances) {
params_idx ++;
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
}
// keep the same model between tests when possible
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
if (lmodel) {
@ -1428,12 +1570,40 @@ int main(int argc, char ** argv) {
llama_kv_cache_clear(ctx);
// cool off before the test
if (params.delay) {
std::this_thread::sleep_for(std::chrono::seconds(params.delay));
}
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
exit(1);
}
tpp.strict_cpu = t.cpu_strict;
tpp.poll = t.poll;
tpp.prio = params.prio;
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
}
llama_attach_threadpool(ctx, threadpool, NULL);
// warmup run
if (t.n_prompt > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
}
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
}
test_gen(ctx, 1, 0, t.n_threads);
}
@ -1443,9 +1613,15 @@ int main(int argc, char ** argv) {
uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
}
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
}
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
}
@ -1463,9 +1639,11 @@ int main(int argc, char ** argv) {
fflush(p_err->fout);
}
llama_print_timings(ctx);
llama_perf_context_print(ctx);
llama_free(ctx);
ggml_threadpool_free(threadpool);
}
llama_free_model(lmodel);

View file

@ -18,6 +18,7 @@ android {
}
externalNativeBuild {
cmake {
arguments += "-DLLAMA_BUILD_COMMON=ON"
arguments += "-DCMAKE_BUILD_TYPE=Release"
cppFlags += listOf()
arguments += listOf()

View file

@ -120,7 +120,7 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
LOGi("Using %d threads", n_threads);
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = 2048;
ctx_params.n_threads = n_threads;
ctx_params.n_threads_batch = n_threads;
@ -269,12 +269,6 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
return env->NewStringUTF(result.str().c_str());
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
@ -311,6 +305,29 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
return reinterpret_cast<jlong>(batch);
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) {
auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = true;
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
return reinterpret_cast<jlong>(smpl);
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) {
llama_sampler_free(reinterpret_cast<llama_sampler *>(sampler_pointer));
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
@ -381,31 +398,21 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
jobject,
jlong context_pointer,
jlong batch_pointer,
jlong sampler_pointer,
jint n_len,
jobject intvar_ncur
) {
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
const auto model = llama_get_model(context);
if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
auto n_vocab = llama_n_vocab(model);
auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// sample the most likely token
const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
const auto new_token_id = llama_sampler_sample(sampler, context, -1);
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {

View file

@ -45,8 +45,10 @@ class LLamaAndroid {
private external fun free_context(context: Long)
private external fun backend_init(numa: Boolean)
private external fun backend_free()
private external fun free_batch(batch: Long)
private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
private external fun free_batch(batch: Long)
private external fun new_sampler(): Long
private external fun free_sampler(sampler: Long)
private external fun bench_model(
context: Long,
model: Long,
@ -69,6 +71,7 @@ class LLamaAndroid {
private external fun completion_loop(
context: Long,
batch: Long,
sampler: Long,
nLen: Int,
ncur: IntVar
): String?
@ -101,8 +104,11 @@ class LLamaAndroid {
val batch = new_batch(512, 0, 1)
if (batch == 0L) throw IllegalStateException("new_batch() failed")
val sampler = new_sampler()
if (sampler == 0L) throw IllegalStateException("new_sampler() failed")
Log.i(tag, "Loaded model $pathToModel")
threadLocalState.set(State.Loaded(model, context, batch))
threadLocalState.set(State.Loaded(model, context, batch, sampler))
}
else -> throw IllegalStateException("Model already loaded")
}
@ -114,7 +120,7 @@ class LLamaAndroid {
is State.Loaded -> {
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
while (ncur.value <= nlen) {
val str = completion_loop(state.context, state.batch, nlen, ncur)
val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
if (str == null) {
break
}
@ -138,6 +144,7 @@ class LLamaAndroid {
free_context(state.context)
free_model(state.model)
free_batch(state.batch)
free_sampler(state.sampler);
threadLocalState.set(State.Idle)
}
@ -161,7 +168,7 @@ class LLamaAndroid {
private sealed interface State {
data object Idle: State
data class Loaded(val model: Long, val context: Long, val batch: Long): State
data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State
}
// Enforce only one instance of Llm.

View file

@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
actor LlamaContext {
private var model: OpaquePointer
private var context: OpaquePointer
private var sampling: UnsafeMutablePointer<llama_sampler>
private var batch: llama_batch
private var tokens_list: [llama_token]
var is_done: Bool = false
@ -42,9 +43,15 @@ actor LlamaContext {
self.tokens_list = []
self.batch = llama_batch_init(512, 0, 1)
self.temporary_invalid_cchars = []
let sparams = llama_sampler_chain_default_params()
self.sampling = llama_sampler_chain_init(sparams)
llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax())
llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
}
deinit {
llama_sampler_free(sampling)
llama_batch_free(batch)
llama_free(context)
llama_free_model(model)
@ -69,10 +76,9 @@ actor LlamaContext {
print("Using \(n_threads) threads")
var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 2048
ctx_params.n_threads = UInt32(n_threads)
ctx_params.n_threads_batch = UInt32(n_threads)
ctx_params.n_threads = Int32(n_threads)
ctx_params.n_threads_batch = Int32(n_threads)
let context = llama_new_context_with_model(model, ctx_params)
guard let context else {
@ -144,20 +150,7 @@ actor LlamaContext {
func completion_loop() -> String {
var new_token_id: llama_token = 0
let n_vocab = llama_n_vocab(model)
let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
var candidates = Array<llama_token_data>()
candidates.reserveCapacity(Int(n_vocab))
for token_id in 0..<n_vocab {
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
}
candidates.withUnsafeMutableBufferPointer() { buffer in
var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
new_token_id = llama_sample_token_greedy(context, &candidates_p)
}
new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n")

View file

@ -1,135 +0,0 @@
" Requires an already running llama.cpp server
" To install either copy or symlink to ~/.vim/autoload/llama.vim
" Then start with either :call llama#doLlamaGen(),
" or add a keybind to your vimrc such as
" nnoremap Z :call llama#doLlamaGen()<CR>
" Similarly, you could add an insert mode keybind with
" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
"
" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
" let g:llama_api_url = "192.168.1.10:8080"
" llama_overrides can also be set through buffer/window scopes. For instance
" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
" Could be added to your .vimrc to automatically set a lower temperature when
" editing a python script
" Additionally, an override dict can be stored at the top of a file
" !*{"stop": ["User:"]}
" Could be added to the start of your chatlog.txt to set the stopping token
" These parameter dicts are merged together from lowest to highest priority:
" server default -> g:llama_overrides -> w:llama_overrides ->
" b:llama_overrides -> in file (!*) overrides
"
" Sublists (like logit_bias and stop) are overridden, not merged
" Example override:
" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
if !exists("g:llama_api_url")
let g:llama_api_url= "127.0.0.1:8080"
endif
if !exists("g:llama_overrides")
let g:llama_overrides = {}
endif
const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
let s:linedict = {}
func s:callbackHandler(bufn, channel, msg)
if len(a:msg) < 3
return
elseif a:msg[0] == "d"
let l:msg = a:msg[6:-1]
else
let l:msg = a:msg
endif
let l:decoded_msg = json_decode(l:msg)
let l:newtext = split(l:decoded_msg['content'], "\n", 1)
if len(l:newtext) > 0
call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
else
echo "nothing genned"
endif
if len(newtext) > 1
let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
endif
if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
echo "Finished generation"
endif
endfunction
func llama#doLlamaGen()
if exists("b:job")
if job_status(b:job) == "run"
call job_stop(b:job)
return
endif
endif
let l:cbuffer = bufnr("%")
let s:linedict[l:cbuffer] = line('$')
let l:buflines = getbufline(l:cbuffer, 1, 1000)
let l:querydata = copy(s:querydata)
call extend(l:querydata, g:llama_overrides)
if exists("w:llama_overrides")
call extend(l:querydata, w:llama_overrides)
endif
if exists("b:llama_overrides")
call extend(l:querydata, b:llama_overrides)
endif
if l:buflines[0][0:1] == '!*'
let l:userdata = json_decode(l:buflines[0][2:-1])
call extend(l:querydata, l:userdata)
let l:buflines = l:buflines[1:-1]
endif
let l:querydata.prompt = join(l:buflines, "\n")
let l:curlcommand = copy(s:curlcommand)
if exists("g:llama_api_key")
call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
endif
let l:curlcommand[2] = json_encode(l:querydata)
let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
endfunction
" Echos the tokkenization of the provided string , or cursor to end of word
" Onus is placed on the user to include the preceding space
func llama#tokenizeWord(...)
if (a:0 > 0)
let l:input = a:1
else
exe "normal \"*ye"
let l:input = @*
endif
let l:querydata = {"content": l:input}
let l:curlcommand = copy(s:curlcommand)
let l:curlcommand[2] = json_encode(l:querydata)
let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
endfunction
func s:tokenizeWordCallback(plaintext, channel, msg)
echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
endfunction
" Echos the token count of the entire buffer (or provided string)
" Example usage :echo llama#tokenCount()
func llama#tokenCount(...)
if (a:0 > 0)
let l:buflines = a:1
else
let l:buflines = getline(1,1000)
if l:buflines[0][0:1] == '!*'
let l:buflines = l:buflines[1:-1]
endif
let l:buflines = join(l:buflines, "\n")
endif
let l:querydata = {"content": l:buflines}
let l:curlcommand = copy(s:curlcommand)
let l:curlcommand[2] = json_encode(l:querydata)
let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
endfunction
func s:tokenCountCallback(channel, msg)
let resp = json_decode(a:msg)
echo len(resp.tokens)
endfunction

View file

@ -36,3 +36,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TARGET llama-minicpmv-cli)
add_executable(${TARGET} minicpmv-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert_image_encoder_to_gguf \
python ./examples/llava/convert_image_encoder_to_gguf.py \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
--output-dir path/to/MobileVLM-1.7B \
@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf \
```
```sh
python ./examples/llava/convert_image_encoder_to_gguf \
python ./examples/llava/convert_image_encoder_to_gguf.py \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
--output-dir path/to/MobileVLM-1.7B_V2 \
@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf \
4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
```
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
```sh
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
```
Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.

View file

@ -0,0 +1,99 @@
## MiniCPM-Llama3-V 2.5
### Prepare models and code
Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder.
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
### Usage
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
# quantize int4 version
./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build for Linux or Mac
```bash
make
make llama-minicpmv-cli
```
Inference on Linux or Mac
```
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
### Android
#### Build on Android device using Termux
We found that build on Android device would bring better runtime performance, so we recommend to build on device.
[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
Install tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```

View file

@ -0,0 +1,107 @@
## MiniCPM-V 2.6
### Prepare models and code
Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder.
Clone llama.cpp:
```bash
git clone git@github.com:OpenBMB/llama.cpp.git
cd llama.cpp
git checkout minicpmv-main
```
### Usage of MiniCPM-V 2.6
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
# quantize int4 version
./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build for Linux or Mac
```bash
make
make llama-minicpmv-cli
```
Inference on Linux or Mac
```
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
### Video
Install FFmpeg
```
brew install ffmpeg
brew install pkg-config
```
### Android
#### Build on Android device using Termux
We found that build on Android device would bring better runtime performance, so we recommend to build on device.
[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
Install tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```

File diff suppressed because it is too large Load diff

View file

@ -18,14 +18,17 @@
# define CLIP_API
#endif
struct clip_ctx;
#ifdef __cplusplus
extern "C" {
#endif
struct clip_ctx;
struct clip_image_size {
int width;
int height;
};
struct clip_image_u8_batch {
struct clip_image_u8 * data;
size_t size;
@ -55,6 +58,10 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();
@ -78,6 +85,8 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
#ifdef __cplusplus
}
#endif

View file

@ -274,7 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu)
if has_llava_projector:
model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue]
model.vision_model.encoder.layers.pop(-1)
projector = torch.load(args.llava_projector)
for name, data in projector.items():
name = get_tensor_name(name)
@ -288,7 +288,7 @@ if has_llava_projector:
print("Projector tensors added\n")
state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue]
state_dict = model.state_dict()
for name, data in state_dict.items():
if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
# we don't need this

View file

@ -1,14 +1,16 @@
#include "ggml.h"
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "base64.hpp"
#include "ggml.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
@ -19,7 +21,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
@ -40,11 +42,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
return true;
}
static const char * sample(struct llama_sampling_context * ctx_sampling,
static const char * sample(struct gpt_sampler * smpl,
struct llama_context * ctx_llama,
int * n_past) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
gpt_sampler_accept(smpl, id, true);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>";
@ -74,7 +76,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL;
}
@ -88,7 +90,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) {
LOG_TEE("%s: could not load image from base64 string.\n", __func__);
LOG_ERR("%s: could not load image from base64 string.\n", __func__);
return NULL;
}
@ -112,12 +114,10 @@ struct llava_context {
struct llama_model * model = NULL;
};
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\n example usage:\n");
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
static void print_usage(int, char ** argv) {
LOG("\n example usage:\n");
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@ -127,16 +127,16 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
auto prompt = params->prompt;
if (prompt_contains_image(prompt)) {
if (!params->image.empty()) {
LOG_TEE("using base64 encoded image instead of command line image path\n");
LOG_INF("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
LOG_TEE("%s: can't load image from prompt\n", __func__);
LOG_ERR("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
@ -157,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
LOG_INF("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
LOG_INF("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
} else {
@ -178,7 +178,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
}
@ -189,21 +189,21 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
// generate the response
LOG_TEE("\n");
LOG("\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
if (!ctx_sampling) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
if (!smpl) {
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
std::string response = "";
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);
LOG("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@ -211,8 +211,8 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
fflush(stdout);
}
llama_sampling_free(ctx_sampling);
printf("\n");
gpt_sampler_free(smpl);
LOG("\n");
}
static struct llama_model * llava_init(gpt_params * params) {
@ -223,7 +223,7 @@ static struct llama_model * llava_init(gpt_params * params) {
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n" , __func__);
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
@ -246,11 +246,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip;
@ -269,65 +269,54 @@ static void llava_free(struct llava_context * ctx_llava) {
llama_backend_free();
}
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
int main(int argc, char ** argv) {
ggml_time_init();
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
return 1;
}
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("llava", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
gpt_init();
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv, {});
print_usage(argc, argv);
return 1;
}
auto model = llava_init(&params);
auto * model = llava_init(&params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
return 1;
}
if (prompt_contains_image(params.prompt)) {
auto ctx_llava = llava_init_context(&params, model);
auto * ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, "");
auto * image_embed = load_image(ctx_llava, &params, "");
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama);
llama_perf_context_print(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
} else {
for (auto & image : params.image) {
auto ctx_llava = llava_init_context(&params, model);
auto * ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, image);
auto * image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) {
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
return 1;
}
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama);
llama_perf_context_print(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);

View file

@ -1,13 +1,23 @@
#include "clip.h"
#include "common.h"
#include "llama.h"
#include "llava.h"
#include "base64.hpp"
#include "llama.h"
#include <algorithm>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <vector>
#include <numeric>
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
// RGB uint8 image
struct clip_image_u8 {
@ -54,7 +64,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution;
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
// LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution;
@ -184,7 +194,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
ggml_build_forward_expand(gf, flatten);
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor* result = ggml_graph_node(gf, -1);
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
// append without newline tokens (default behavior in llava_arch when not using unpad ):
@ -202,6 +212,33 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
return true;
}
static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
int width = image->nx;
int height = image->ny;
int num_patches = (height / patch_size) * (width / patch_size);
clip_image_f32 * patch = clip_image_f32_init();
patch->nx = patch_size * num_patches;
patch->ny = patch_size;
patch->buf.resize(3 * patch->nx * patch->ny);
int patch_index = 0;
for (int i = 0; i < height; i += patch_size) {
for (int j = 0; j < width; j += patch_size) {
for (int pi = 0; pi < patch_size; ++pi) {
for (int pj = 0; pj < patch_size; ++pj) {
int input_index = ((i + pi) * width + (j + pj)) * 3;
int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
patch->buf[output_index] = image->buf[input_index];
patch->buf[output_index+1] = image->buf[input_index+1];
patch->buf[output_index+2] = image->buf[input_index+2];
}
}
patch_index++;
}
}
return patch;
}
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
// std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
@ -209,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
img_res_v.size = 0;
img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
LOG_TEE("%s: unable to preprocess image\n", __func__);
LOG_ERR("%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data;
return false;
}
@ -218,17 +255,62 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
if (clip_is_minicpmv(ctx_clip)) {
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
struct clip_image_size * load_image_size = clip_image_size_init();
for (size_t i = 0; i < img_res_v.size; i++) {
const int64_t t_img_enc_step_start_us = ggml_time_us();
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
int patch_size=14;
load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) {
std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
n_img_pos_out += clip_n_patches(ctx_clip);
}
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
free(image_embd_v[i]);
}
image_embd_v.clear();
load_image_size->width = img->nx;
load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size);
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data;
if (!encoded) {
LOG_TEE("Unable to encode image\n");
LOG_ERR("Unable to encode image\n");
return false;
}
} else {
}
else {
// spatial_unpad llava-1.6 type embedding
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
std::vector<float *> image_embd_v;
@ -237,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) {
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip);
@ -275,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
}
LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
return true;
}
@ -290,22 +372,26 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) {
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
return false;
}
return true;
}
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
int num_max_patches = 6;
if (clip_is_minicpmv(ctx_clip)) {
num_max_patches = 10;
}
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
if (!image_embd) {
LOG_TEE("Unable to allocate memory for image embeddings\n");
LOG_ERR("Unable to allocate memory for image embeddings\n");
return false;
}
int n_img_pos;
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
LOG_TEE("%s: cannot encode image, aborting\n", __func__);
LOG_ERR("%s: cannot encode image, aborting\n", __func__);
free(image_embd);
return false;
}
@ -325,7 +411,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
}
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
LOG_TEE("%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
@ -337,7 +423,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
return NULL;
}
@ -346,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) {
clip_image_u8_free(img);
LOG_TEE("%s: coulnd't embed the image\n", __func__);
LOG_ERR("%s: coulnd't embed the image\n", __func__);
return NULL;
}
@ -360,7 +446,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
auto file = fopen(path, "rb");
if (file == NULL) {
LOG_TEE("%s: can't read file %s\n", __func__, path);
LOG_ERR("%s: can't read file %s\n", __func__, path);
return false;
}
@ -370,7 +456,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
if (buffer == NULL) {
LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
perror("Memory allocation error");
fclose(file);
return false;
@ -395,7 +481,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) {
LOG_TEE("%s: failed to load %s\n", __func__, image_path);
LOG_ERR("%s: failed to load %s\n", __func__, image_path);
return NULL;
}

View file

@ -17,12 +17,11 @@
# define LLAVA_API
#endif
struct clip_ctx;
#ifdef __cplusplus
extern "C" {
#endif
struct clip_ctx;
struct llava_image_embed {
float * embed;
int n_image_pos;
@ -37,8 +36,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip,
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
/** build an image embed from a path to an image filename */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
/** free an embedding made with llava_image_embed_make_* */
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);

View file

@ -0,0 +1,323 @@
#include "arg.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "ggml.h"
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <iostream> // TODO: remove me
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void show_additional_info(int /*argc*/, char ** argv) {
LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llama_model * llava_init(gpt_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = llama_model_params_from_gpt_params(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
}
static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
if (params->n_ctx < 2048) {
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
ctx_params.n_ctx = 2048;
} else {
ctx_params.n_ctx = params->n_ctx;
}
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->model = model;
return ctx_llava;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
llama_free(ctx_llava->ctx_llama);
llama_free_model(ctx_llava->model);
llama_backend_free();
}
static struct clip_ctx * clip_init_context(gpt_params * params) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
return ctx_clip;
}
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
}
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
}
static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
slice_embed->embed = image_embed;
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
llava_image_embed_free(slice_embed);
}
static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
std::string system_prompt;
int idx = 0;
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (has_minicpmv_projector == 2) {
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
}
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
LOG_INF("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (num_image_embeds > 1) {
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
for (size_t j = 0; j < num_image_embeds_col; ++j) {
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (j == num_image_embeds_col - 1) {
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
}
}
}
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
}
LOG_INF("%s: image token past: %d\n", __func__, n_past);
}
static const char * sample(struct gpt_sampler * smpl,
struct llama_context * ctx_llama,
int * n_past) {
const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
gpt_sampler_accept(smpl, id, true);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
auto * ctx_clip = clip_init_context(params);
auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embeds) {
LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
return NULL;
}
// process the prompt
if (params->prompt.empty() && params->interactive == false) {
LOG_ERR("prompt should be given or interactive mode should be on");
return NULL;
}
auto * model = llava_init(params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
return NULL;
}
const int64_t t_llava_init_start_us = ggml_time_us();
auto * ctx_llava = llava_init_context(params, model);
ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
llava_image_embed_free(embeds);
return ctx_llava;
}
static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
std::string user_prompt = prompt;
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) {
if (has_minicpmv_projector == 2) {
user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
}
else if (has_minicpmv_projector == 3) {
user_prompt = "<|im_start|>user\n" + prompt;
}
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
if (has_minicpmv_projector == 2) {
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 3) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
// generate the response
LOG_INF("\n");
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
return smpl;
}
static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
return tmp;
}
int main(int argc, char ** argv) {
ggml_time_init();
gpt_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
return 1;
}
gpt_init();
if (params.mmproj.empty() || (params.image.empty())) {
show_additional_info(argc, argv);
return 1;
}
for (auto & image : params.image) {
int n_past = 0;
auto * ctx_llava = minicpmv_init(&params, image, n_past);
if (!params.prompt.empty()) {
LOG("<user>%s\n", params.prompt.c_str());
LOG("<assistant>");
auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response;
bool have_tmp = false;
for (int i = 0; i < max_tgt_len; i++) {
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0){
if (!have_tmp) {
continue;
}
break;
}
if (strstr(tmp, "###")) break; // Yi-VL behavior
have_tmp = true;
printf("%s", tmp);
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
fflush(stdout);
}
gpt_sampler_free(smpl);
}else {
while (true) {
LOG("<user>");
std::string prompt;
std::getline(std::cin, prompt);
LOG("<assistant>");
auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response;
for (int i = 0; i < max_tgt_len; i++) {
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);// mistral llava-1.6
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
fflush(stdout);
}
gpt_sampler_free(smpl);
}
}
printf("\n");
llama_perf_context_print(ctx_llava->ctx_llama);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
return 0;
}

Some files were not shown because too many files have changed in this diff Show more