Compare commits
1 commit
master ... codeplay/s

Author | SHA1 | Date
---|---|---
 | eab4a88210 |

837 changed files with 104853 additions and 186987 deletions
.clang-format (deleted, 161 lines)

@@ -1,161 +0,0 @@
---
Language: Cpp
AlignAfterOpenBracket: Align
AlignArrayOfStructures: Left
AlignConsecutiveAssignments: AcrossComments
AlignConsecutiveBitFields: AcrossComments
AlignConsecutiveDeclarations: AcrossComments
AlignConsecutiveMacros: AcrossComments
# AlignConsecutiveShortCaseStatements: AcrossComments
AlignEscapedNewlines: Left # LeftWithLastLine
AlignOperands: Align
AlignTrailingComments:
  Kind: Always
  OverEmptyLines: 1
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: false
# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: true
BinPackParameters: true # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
  AfterCaseLabel: true
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  BeforeLambdaBody: false
  BeforeWhile: false
  IndentBraces: false
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never
BreakBeforeBinaryOperators: None
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: false
# BreakBinaryOperations: Never
BreakConstructorInitializers: AfterColon
# BreakFunctionDefinitionParameters: false
BreakInheritanceList: AfterComma
BreakStringLiterals: true
# BreakTemplateDeclarations: Yes
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
EmptyLineBeforeAccessModifier: Leave
EmptyLineAfterAccessModifier: Never
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
  - Regex: '^<.*\.h>'
    Priority: 1
    SortPriority: 0
  - Regex: '^<.*'
    Priority: 2
    SortPriority: 0
  - Regex: '.*'
    Priority: 3
    SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseBlocks: true
IndentCaseLabels: true
IndentExternBlock: NoIndent
IndentGotoLabels: false
IndentPPDirectives: AfterHash
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertBraces: true # NOTE: may lead to incorrect formatting
InsertNewlineAtEOF: true
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
LambdaBodyIndentation: Signature
LineEnding: LF
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PPIndentWidth: -1
PackConstructorInitializers: CurrentLine
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Middle
QualifierAlignment: Left
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
RawStringFormats:
  - Language: Cpp
    Delimiters:
      - cc
      - CC
      - cpp
      - Cpp
      - CPP
      - 'c++'
      - 'C++'
    CanonicalDelimiter: ''
ReferenceAlignment: Middle
ReflowComments: false # IndentOnly
SeparateDefinitionBlocks: Always
SortIncludes: CaseInsensitive
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: true
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInContainerLiterals: true
SpacesInLineCommentPrefix:
  Minimum: 1
  Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: c++17
TabWidth: 4
UseTab: Never
WhitespaceSensitiveMacros: ['STRINGIZE']
...

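As an aside (not part of the diff): a minimal, hypothetical C++ sketch of what a few of the options in the removed style file produce when clang-format applies them, assuming the settings above (IndentWidth 4, PointerAlignment Middle, AlignConsecutiveDeclarations, SpaceAfterCStyleCast). The type and function names are invented for illustration only.

// Hypothetical snippet (not from the repository), formatted roughly the way the
// removed style would format it: 4-space indentation, the '*' placed between
// type and name (PointerAlignment: Middle), consecutive declarations aligned,
// and a space after C-style casts (SpaceAfterCStyleCast: true).
struct token_data {
    float * logits;    // pointer written as "float * logits"
    int     n_vocab;   // aligned with the declaration above
};

static float first_logit(const void * src) {
    const token_data * td = (const token_data *) src;  // note the space after the cast
    return td->logits[0];
}
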
@@ -17,10 +17,8 @@ Checks: >
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
     portability-*,
-    -portability-simd-intrinsics,
     misc-*,
     -misc-const-correctness,
     -misc-non-private-member-variables-in-classes,
     -misc-no-recursion,
-    -misc-use-anonymous-namespace,
 FormatStyle: none

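Also as an aside (not part of the diff): the `performance-*` group that stays enabled above includes checks such as `performance-unnecessary-value-param`. A small, hypothetical example of the pattern that check flags:

// Hypothetical example (not from the repository). clang-tidy's
// performance-unnecessary-value-param check suggests taking `name` by
// const reference instead of by value to avoid an unnecessary copy.
#include <cstddef>
#include <string>

static std::size_t name_length(std::string name) {  // flagged: parameter copied on every call
    return name.size();
}

static std::size_t name_length_fixed(const std::string & name) {  // preferred form
    return name.size();
}
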
@@ -1,92 +0,0 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

ARG TARGETARCH

ARG GGML_CPU_ARM_ARCH=armv8-a

RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "$TARGETARCH" = "amd64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
    elif [ "$TARGETARCH" = "arm64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
    else \
        echo "Unsupported architecture"; \
        exit 1; \
    fi && \
    cmake --build build -j $(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ubuntu:$UBUNTU_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

@@ -1,94 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

WORKDIR /app

COPY . .

RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_CUDA_RUN_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete


ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

.devops/full-cuda.Dockerfile (new file, 36 lines)

@@ -0,0 +1,36 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1

RUN make -j$(nproc)

ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/full-rocm.Dockerfile (new file, 50 lines)

@@ -0,0 +1,50 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev

RUN make -j$(nproc)

ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/full.Dockerfile (new file, 25 lines)

@@ -0,0 +1,25 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

ENV LLAMA_CURL=1


RUN make -j$(nproc)

ENV LC_ALL=C.utf8

ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,91 +0,0 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04

## Build Image

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" \
        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

### Full
FROM base AS full

COPY --from=build /app/lib/ /app
COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete


ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

@@ -1,44 +0,0 @@
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8

FROM ascendai/cann:$ASCEND_VERSION AS build

WORKDIR /app

COPY . .

RUN yum install -y gcc g++ cmake make
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}

# find libascend_hal.so, because the drive hasn`t been mounted.
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH

RUN echo "Building with static libs" && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
    cmake --build build --config Release --target llama-cli

# TODO: use image with NNRT
FROM ascendai/cann:$ASCEND_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli

ENV LC_ALL=C.utf8

ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}

ENTRYPOINT ["/llama-cli" ]

.devops/llama-cli-cuda.Dockerfile (new file, 35 lines)

@@ -0,0 +1,35 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1

RUN make -j$(nproc) llama-cli

FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

RUN apt-get update && \
    apt-get install -y libgomp1

COPY --from=build /app/llama-cli /llama-cli

ENTRYPOINT [ "/llama-cli" ]

.devops/llama-cli-intel.Dockerfile (new file, 28 lines)

@@ -0,0 +1,28 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git

WORKDIR /app

COPY . .

RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with static libs" && \
    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
    cmake --build build --config Release --target llama-cli

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

COPY --from=build /app/build/bin/llama-cli /llama-cli

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-cli" ]

.devops/llama-cli-rocm.Dockerfile (new file, 45 lines)

@@ -0,0 +1,45 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

RUN make -j$(nproc) llama-cli

ENTRYPOINT [ "/app/llama-cli" ]

.devops/llama-cli-vulkan.Dockerfile (new file, 27 lines)

@@ -0,0 +1,27 @@
ARG UBUNTU_VERSION=jammy

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget libgomp1

# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk

# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_VULKAN=1 && \
    cmake --build build --config Release --target llama-cli

# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-cli /llama-cli && \
    rm -rf /app

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-cli" ]

.devops/llama-cli.Dockerfile (new file, 23 lines)

@@ -0,0 +1,23 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

RUN make -j$(nproc) llama-cli

FROM ubuntu:$UBUNTU_VERSION AS runtime

RUN apt-get update && \
    apt-get install -y libgomp1

COPY --from=build /app/llama-cli /llama-cli

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-cli" ]

.devops/llama-server-cuda.Dockerfile (new file, 39 lines)

@@ -0,0 +1,39 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential git libcurl4-openssl-dev

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1

RUN make -j$(nproc) llama-server

FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl

COPY --from=build /app/llama-server /llama-server

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]

.devops/llama-server-intel.Dockerfile (new file, 32 lines)

@@ -0,0 +1,32 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-server

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev curl

COPY --from=build /app/build/bin/llama-server /llama-server

ENV LC_ALL=C.utf8

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]

.devops/llama-server-rocm.Dockerfile (new file, 52 lines)

@@ -0,0 +1,52 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev curl

RUN make -j$(nproc) llama-server

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

.devops/llama-server-vulkan.Dockerfile (new file, 29 lines)

@@ -0,0 +1,29 @@
ARG UBUNTU_VERSION=jammy

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl

# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build build --config Release --target llama-server

# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-server /llama-server && \
    rm -rf /app

ENV LC_ALL=C.utf8

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]

.devops/llama-server.Dockerfile (new file, 27 lines)

@@ -0,0 +1,27 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
    apt-get install -y build-essential git libcurl4-openssl-dev curl

WORKDIR /app

COPY . .

ENV LLAMA_CURL=1

RUN make -j$(nproc) llama-server

FROM ubuntu:$UBUNTU_VERSION AS runtime

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1

COPY --from=build /app/llama-server /llama-server

ENV LC_ALL=C.utf8

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]

@@ -1,108 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_MUSA_DEV_CONTAINER} AS build

# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y \
        build-essential \
        cmake \
        python3 \
        python3-pip \
        git \
        libcurl4-openssl-dev \
        libgomp1

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_MUSA_RUN_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete


ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

@@ -1,52 +1,13 @@
-{ inputs, ... }:
-
 {
   perSystem =
-    {
-      config,
-      lib,
-      system,
-      ...
-    }:
+    { config, lib, ... }:
     {
       devShells =
-        let
-          pkgs = import inputs.nixpkgs { inherit system; };
-          stdenv = pkgs.stdenv;
-          scripts = config.packages.python-scripts;
-        in
-        lib.pipe (config.packages) [
-          (lib.concatMapAttrs (
-            name: package: {
-              ${name} = pkgs.mkShell {
-                name = "${name}";
-                inputsFrom = [ package ];
-                shellHook = ''
-                  echo "Entering ${name} devShell"
-                '';
-              };
-              "${name}-extra" =
-                if (name == "python-scripts") then
-                  null
-                else
-                  pkgs.mkShell {
-                    name = "${name}-extra";
-                    inputsFrom = [
-                      package
-                      scripts
-                    ];
-                    # Extra packages that *may* be used by some scripts
-                    packages = [
-                      pkgs.python3Packages.tiktoken
-                    ];
-                    shellHook = ''
-                      echo "Entering ${name} devShell"
-                      addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
-                    '';
-                  };
-            }
-          ))
-          (lib.filterAttrs (name: value: value != null))
-        ];
+        lib.concatMapAttrs
+          (name: package: {
+            ${name} = package.passthru.shell;
+            ${name + "-extra"} = package.passthru.shell-extra;
+          })
+          config.packages;
     };
 }

@@ -26,14 +26,16 @@
         config.cudaSupport = true;
         config.allowUnfreePredicate =
           p:
-          builtins.all (
+          builtins.all
+            (
               license:
               license.free
               || builtins.elem license.shortName [
                 "CUDA EULA"
                 "cuDNN EULA"
               ]
-          ) (p.meta.licenses or [ p.meta.license ]);
+            )
+            (p.meta.licenses or [ p.meta.license ]);
       };
       # Ensure dependencies use ROCm consistently
       pkgsRocm = import inputs.nixpkgs {

@@ -1,36 +0,0 @@
{
  lib,
  llamaVersion,
  numpy,
  tqdm,
  sentencepiece,
  pyyaml,
  poetry-core,
  buildPythonPackage,
  pytestCheckHook,
}:

buildPythonPackage {
  pname = "gguf";
  version = llamaVersion;
  pyproject = true;
  nativeBuildInputs = [ poetry-core ];
  propagatedBuildInputs = [
    numpy
    tqdm
    sentencepiece
    pyyaml
  ];
  src = lib.cleanSource ../../gguf-py;
  pythonImportsCheck = [
    "numpy"
    "gguf"
  ];
  nativeCheckInputs = [ pytestCheckHook ];
  doCheck = true;
  meta = with lib; {
    description = "Python package for writing binary files in the GGUF format";
    license = licenses.mit;
    maintainers = [ maintainers.ditsuke ];
  };
}

@@ -3,35 +3,32 @@
   glibc,
   config,
   stdenv,
+  mkShell,
   runCommand,
   cmake,
   ninja,
   pkg-config,
   git,
+  python3,
   mpi,
   blas,
   cudaPackages,
-  autoAddDriverRunpath,
   darwin,
   rocmPackages,
   vulkan-headers,
   vulkan-loader,
   curl,
   shaderc,
-  useBlas ?
-    builtins.all (x: !x) [
-      useCuda
-      useMetalKit
-      useRocm
-      useVulkan
-    ]
-    && blas.meta.available,
+  useBlas ? builtins.all (x: !x) [
+    useCuda
+    useMetalKit
+    useRocm
+    useVulkan
+  ] && blas.meta.available,
   useCuda ? config.cudaSupport,
   useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
-  # Increases the runtime closure size by ~700M
-  useMpi ? false,
+  useMpi ? false, # Increases the runtime closure size by ~700M
   useRocm ? config.rocmSupport,
-  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
   enableCurl ? true,
   useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

@@ -40,8 +37,8 @@
   # otherwise we get libstdc++ errors downstream.
   effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
   enableStatic ? effectiveStdenv.hostPlatform.isStatic,
-  precompileMetalShaders ? false,
-}:
+  precompileMetalShaders ? false
+}@inputs:
 
 let
   inherit (lib)

@@ -49,6 +46,7 @@ let
     cmakeFeature
     optionals
     strings
+    versionOlder
     ;
 
   stdenv = throw "Use effectiveStdenv instead";

@@ -64,9 +62,52 @@ let
   pnameSuffix =
     strings.optionalString (suffices != [ ])
       "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix = strings.optionalString (
-    suffices != [ ]
-  ) ", accelerated with ${strings.concatStringsSep ", " suffices}";
+  descriptionSuffix =
+    strings.optionalString (suffices != [ ])
+      ", accelerated with ${strings.concatStringsSep ", " suffices}";
+
+  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
+
+  # TODO: package the Python in this repository in a Nix-like way.
+  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
+  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
+  # https://peps.python.org/pep-0517/
+  #
+  # TODO: Package up each Python script or service appropriately, by making
+  # them into "entrypoints"
+  llama-python = python3.withPackages (
+    ps: [
+      ps.numpy
+      ps.sentencepiece
+    ]
+  );
+
+  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
+  llama-python-extra = python3.withPackages (
+    ps: [
+      ps.numpy
+      ps.sentencepiece
+      ps.tiktoken
+      ps.torchWithoutCuda
+      ps.transformers
+
+      # server bench
+      ps.matplotlib
+
+      # server tests
+      ps.openai
+      ps.behave
+      ps.prometheus-client
+
+      # for examples/pydantic-models-to-grammar-examples.py
+      ps.docstring-parser
+      ps.pydantic
+
+      # for scripts/compare-llama-bench.py
+      ps.gitpython
+      ps.tabulate
+    ]
+  );
 
   xcrunHost = runCommand "xcrunHost" {} ''
     mkdir -p $out/bin

@@ -85,9 +126,16 @@ let
     ++ optionals useMetalKit [ MetalKit ];
 
   cudaBuildInputs = with cudaPackages; [
-    cuda_cudart
-    cuda_cccl # <nv/target>
-    libcublas
+    cuda_cccl.dev # <nv/target>
+
+    # A temporary hack for reducing the closure size, remove once cudaPackages
+    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
+    cuda_cudart.dev
+    cuda_cudart.lib
+    cuda_cudart.static
+    libcublas.dev
+    libcublas.lib
+    libcublas.static
   ];
 
   rocmBuildInputs = with rocmPackages; [

@@ -103,7 +151,8 @@ let
   ];
 in
 
-effectiveStdenv.mkDerivation (finalAttrs: {
+effectiveStdenv.mkDerivation (
+  finalAttrs: {
   pname = "llama-cpp${pnameSuffix}";
   version = llamaVersion;
 

@@ -127,9 +176,9 @@ effectiveStdenv.mkDerivation (finalAttrs: {
   };
 
   postPatch = ''
-    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
+    substituteInPlace ./ggml/src/ggml-metal.m \
       --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
+    substituteInPlace ./ggml/src/ggml-metal.m \
       --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
   '';
 

@@ -151,10 +200,15 @@ effectiveStdenv.mkDerivation (finalAttrs: {
     ++ optionals useCuda [
       cudaPackages.cuda_nvcc
 
-      autoAddDriverRunpath
+      # TODO: Replace with autoAddDriverRunpath
+      # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
+      cudaPackages.autoAddOpenGLRunpathHook
     ]
-    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
-    ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
+    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
+      glibc.static
+    ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
+      xcrunHost
+    ];
 
   buildInputs =
     optionals effectiveStdenv.isDarwin darwinBuildInputs

@@ -174,7 +228,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
       (cmakeBool "GGML_NATIVE" false)
       (cmakeBool "GGML_BLAS" useBlas)
       (cmakeBool "GGML_CUDA" useCuda)
-      (cmakeBool "GGML_HIP" useRocm)
+      (cmakeBool "GGML_HIPBLAS" useRocm)
       (cmakeBool "GGML_METAL" useMetalKit)
       (cmakeBool "GGML_VULKAN" useVulkan)
       (cmakeBool "GGML_STATIC" enableStatic)

@@ -189,7 +243,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
     ]
     ++ optionals useRocm [
       (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
+      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
     ]
     ++ optionals useMetalKit [
       (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")

@@ -209,6 +263,35 @@ effectiveStdenv.mkDerivation (finalAttrs: {
     cp $src/include/llama.h $out/include/
   '';
 
+  # Define the shells here, but don't add in the inputsFrom to avoid recursion.
+  passthru = {
+    inherit
+      useBlas
+      useCuda
+      useMetalKit
+      useMpi
+      useRocm
+      useVulkan
+      ;
+
+    shell = mkShell {
+      name = "shell-${finalAttrs.finalPackage.name}";
+      description = "contains numpy and sentencepiece";
+      buildInputs = [ llama-python ];
+      inputsFrom = [ finalAttrs.finalPackage ];
+      shellHook = ''
+        addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
+      '';
+    };
+
+    shell-extra = mkShell {
+      name = "shell-extra-${finalAttrs.finalPackage.name}";
+      description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
+      buildInputs = [ llama-python-extra ];
+      inputsFrom = [ finalAttrs.finalPackage ];
+    };
+  };
+
   meta = {
     # Configurations we don't want even the CI to evaluate. Results in the
     # "unsupported platform" messages. This is mostly a no-op, because

@@ -244,4 +327,5 @@ effectiveStdenv.mkDerivation (finalAttrs: {
     # Extend `badPlatforms` instead
     platforms = lib.platforms.all;
   };
-})
+  }
+)

@@ -1,66 +0,0 @@
{
  lib,
  stdenv,
  buildPythonPackage,
  poetry-core,
  mkShell,
  python3Packages,
  gguf-py,
}@inputs:

let
  llama-python-deps = with python3Packages; [
    numpy
    sentencepiece
    transformers
    protobuf
    torchWithoutCuda
    gguf-py
    tqdm

    # for scripts/compare-llama-bench.py
    gitpython
    tabulate

    # for examples/pydantic-models-to-grammar-examples.py
    docstring-parser
    pydantic

  ];

  llama-python-test-deps = with python3Packages; [
    # Server bench
    matplotlib

    # server tests
    openai
    pytest
    prometheus-client
  ];
in

buildPythonPackage ({
  pname = "llama-scripts";
  version = "0.0.0";
  pyproject = true;

  # NOTE: The files filtered out here are not visible in the build sandbox, neither
  # do they affect the output hash. They can be modified without triggering a rebuild.
  src = lib.cleanSourceWith {
    filter =
      name: type:
      let
        any = builtins.any (x: x);
        baseName = builtins.baseNameOf name;
      in
      any [
        (lib.hasSuffix ".py" name)
        (baseName == "README.md")
        (baseName == "pyproject.toml")
      ];
    src = lib.cleanSource ../../.;
  };
  nativeBuildInputs = [ poetry-core ];
  nativeCheckInputs = llama-python-test-deps;
  dependencies = llama-python-deps;
})

@@ -1,41 +1,19 @@
 {
   lib,
   newScope,
-  python3,
   llamaVersion ? "0.0.0",
 }:
 
-let
-  pythonPackages = python3.pkgs;
-  buildPythonPackage = pythonPackages.buildPythonPackage;
-  numpy = pythonPackages.numpy;
-  tqdm = pythonPackages.tqdm;
-  sentencepiece = pythonPackages.sentencepiece;
-  pyyaml = pythonPackages.pyyaml;
-  poetry-core = pythonPackages.poetry-core;
-  pytestCheckHook = pythonPackages.pytestCheckHook;
-in
-
 # We're using `makeScope` instead of just writing out an attrset
 # because it allows users to apply overlays later using `overrideScope'`.
 # Cf. https://noogle.dev/f/lib/makeScope
 
-lib.makeScope newScope (self: {
+lib.makeScope newScope (
+  self: {
   inherit llamaVersion;
-  gguf-py = self.callPackage ./package-gguf-py.nix {
-    inherit
-      buildPythonPackage
-      numpy
-      tqdm
-      sentencepiece
-      poetry-core
-      pyyaml
-      pytestCheckHook
-      ;
-  };
-  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
   llama-cpp = self.callPackage ./package.nix { };
   docker = self.callPackage ./docker.nix { };
   docker-min = self.callPackage ./docker.nix { interactive = false; };
   sif = self.callPackage ./sif.nix { };
-})
+  }
+)

@@ -1,113 +0,0 @@
ARG UBUNTU_VERSION=24.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.3
ARG AMDGPU_VERSION=6.3

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

### Build image
FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
# gfx906 is deprecated
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html

#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
ARG ROCM_DOCKER_ARCH=gfx1100

# Set nvcc architectured
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
# ENV CC=/opt/rocm/llvm/bin/clang
# ENV CXX=/opt/rocm/llvm/bin/clang++

RUN apt-get update \
    && apt-get install -y \
    build-essential \
    cmake \
    git \
    libcurl4-openssl-dev \
    curl \
    libgomp1

WORKDIR /app

COPY . .

RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
    && cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib \
    && find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_ROCM_DEV_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3-pip \
    python3 \
    python3-wheel\
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
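For reference, the multi-stage ROCm image deleted above would be built roughly as follows. This is a sketch only: the Dockerfile path and the image tag are assumptions, while the `ROCM_DOCKER_ARCH` build arg and the `server` stage come from the file itself.

    # Illustrative sketch; the Dockerfile path and the image tag are assumptions.
    docker build -f .devops/rocm.Dockerfile \
        --build-arg ROCM_DOCKER_ARCH=gfx1100 \
        --target server -t llama-cpp:server-rocm .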
@@ -8,36 +8,28 @@ arg1="$1"
 shift
 
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    exec python3 ./convert_hf_to_gguf.py "$@"
+    python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    exec ./llama-quantize "$@"
+    ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    exec ./llama-cli "$@"
+    ./llama-cli "$@"
-elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
-    exec ./llama-bench "$@"
-elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
-    exec ./llama-perplexity "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
-    for i in $(ls $1/$2/ggml-model-f16.bin*); do
+    for i in `ls $1/$2/ggml-model-f16.bin*`; do
         if [ -f "${i/f16/q4_0}" ]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    exec ./llama-server "$@"
+    ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
     echo "  --run (-r): Run a model previously converted into ggml"
     echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
-    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
-    echo "                ex: -m model.gguf"
-    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
-    echo "                     ex: -m model.gguf -f file.txt"
     echo "  --convert (-c): Convert a llama model into ggml"
     echo "                  ex: --outtype f16 \"/models/7B/\" "
     echo "  --quantize (-q): Optimize with quantization process ggml"
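Since tools.sh is the entrypoint of the `full` images built by the Dockerfiles in this diff, the flags above map directly onto `docker run` arguments. A usage sketch, assuming the ghcr.io image naming used elsewhere in this diff (the exact image name and tag are assumptions):

    # Illustrative usage of the tools.sh entrypoint; the image name/tag is an assumption.
    docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full \
        --run -m /models/7B/ggml-model-q4_0.bin \
        -p "Building a website can be done in 10 simple steps:" -n 512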
@@ -1,89 +0,0 @@
ARG UBUNTU_VERSION=24.04

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl

# Build it
WORKDIR /app

COPY . .

RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ubuntu:$UBUNTU_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl libvulkan-dev \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    python3-wheel \
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
@@ -1,7 +1,7 @@
 *.o
 *.a
 .cache/
-# Do not ignore .git directory, otherwise the reported build number will always be 0
+.git/
 .github/
 .gitignore
 .vs/

2  .ecrc
@@ -1,5 +1,5 @@
 {
-    "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
+    "Exclude": ["^\\.gitmodules$"],
     "Disable": {
         "IndentSize": true
     }
@@ -24,27 +24,9 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2
 
-[examples/server/public/deps_*]
-trim_trailing_whitespace = unset
-indent_style = unset
-indent_size = unset
-
-[examples/server/deps_*]
-trim_trailing_whitespace = unset
-indent_style = unset
-indent_size = unset
-
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
 
 [examples/cvector-generator/*.txt]
 trim_trailing_whitespace = unset
 insert_final_newline = unset
-
-[models/templates/*.jinja]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
-trim_trailing_whitespace = unset
-insert_final_newline = unset

50  .github/ISSUE_TEMPLATE/01-bug-low.yml  (vendored, new file)
@@ -0,0 +1,50 @@
name: Low Severity Bugs
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
title: "Bug: "
labels: ["bug-unconfirmed", "low severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell

87  .github/ISSUE_TEMPLATE/010-bug-compilation.yml  (vendored)
@@ -1,87 +0,0 @@
name: Bug (compilation)
description: Something goes wrong when trying to compile llama.cpp.
title: "Compile bug: "
labels: ["bug-unconfirmed", "compilation"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for bug reports where the compilation of llama.cpp fails.
        Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
        by clearing `~/.cache/ccache` (on Linux).
  - type: textarea
    id: commit
    attributes:
      label: Git commit
      description: Which commit are you trying to compile?
      placeholder: |
        $git rev-parse HEAD
        84a07a17b1b08cf2b9747c633a2372782848a27f
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: true
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
      multiple: true
    validations:
      required: true
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
      placeholder: >
        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
        Here are the exact commands that I used: ...
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: command
    attributes:
      label: Compile command
      description: >
        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        Please copy and paste any relevant log output, including any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true

101  .github/ISSUE_TEMPLATE/011-bug-results.yml  (vendored)
@@ -1,101 +0,0 @@
name: Bug (model use)
description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
title: "Eval bug: "
labels: ["bug-unconfirmed", "model evaluation"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for bug reports where the model evaluation results
        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
        The `llama-cli` binary can be used for simple and reproducible model inference.
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: true
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
      multiple: true
    validations:
      required: true
  - type: textarea
    id: hardware
    attributes:
      label: Hardware
      description: Which CPUs/GPUs are you using?
      placeholder: >
        e.g. Ryzen 5950X + 2x RTX 4090
    validations:
      required: true
  - type: textarea
    id: model
    attributes:
      label: Models
      description: >
        Which model(s) at which quantization were you using when encountering the bug?
        If you downloaded a GGUF file off of Huggingface, please provide a link.
      placeholder: >
        e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
    validations:
      required: false
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
        that information would be very much appreciated by us.
      placeholder: >
        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
        When I use -ngl 0 it works correctly.
        Here are the exact commands that I used: ...
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        Please copy and paste any relevant log output, including the command that you entered and any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true

91  .github/ISSUE_TEMPLATE/019-bug-misc.yml  (vendored)
@@ -1,91 +0,0 @@
name: Bug (misc.)
description: Something is not working the way it should (and it's not covered by any of the above cases).
title: "Misc. bug: "
labels: ["bug-unconfirmed"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for miscellaneous bugs that don't fit into any other category.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which version of our software is affected? (You can use `--version` to get a version string.)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: dropdown
    id: module
    attributes:
      label: Which llama.cpp modules do you know to be affected?
      multiple: true
      options:
        - Documentation/Github
        - libllama (core library)
        - llama-cli
        - llama-server
        - llama-bench
        - llama-quantize
        - Python/Bash scripts
        - Test code
        - Other (Please specify in the next section)
    validations:
      required: false
  - type: textarea
    id: command
    attributes:
      label: Command line
      description: >
        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: false
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        If applicable, please copy and paste any relevant log output, including any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: false

50  .github/ISSUE_TEMPLATE/02-bug-medium.yml  (vendored, new file)
@@ -0,0 +1,50 @@
name: Medium Severity Bug
description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
title: "Bug: "
labels: ["bug-unconfirmed", "medium severity"]
body:
  [remaining 45 lines identical to the body of 01-bug-low.yml above]

50  .github/ISSUE_TEMPLATE/03-bug-high.yml  (vendored, new file)
@@ -0,0 +1,50 @@
name: High Severity Bug
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
title: "Bug: "
labels: ["bug-unconfirmed", "high severity"]
body:
  [remaining 45 lines identical to the body of 01-bug-low.yml above]

50  .github/ISSUE_TEMPLATE/04-bug-critical.yml  (vendored, new file)
@@ -0,0 +1,50 @@
name: Critical Severity Bug
description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
title: "Bug: "
labels: ["bug-unconfirmed", "critical severity"]
body:
  [remaining 45 lines identical to the body of 01-bug-low.yml above]
@@ -1,5 +1,5 @@
 name: Enhancement
-description: Used to request enhancements for llama.cpp.
+description: Used to request enhancements for llama.cpp
 title: "Feature Request: "
 labels: ["enhancement"]
 body:

@@ -1,5 +1,5 @@
 name: Research
-description: Track new technical research area.
+description: Track new technical research area
 title: "Research: "
 labels: ["research 🔬"]
 body:

@@ -1,5 +1,5 @@
 name: Refactor (Maintainers)
-description: Used to track refactoring opportunities.
+description: Used to track refactoring opportunities
 title: "Refactor: "
 labels: ["refactor"]
 body:

15  .github/labeler.yml  (vendored)
@@ -3,18 +3,19 @@ Kompute:
   - changed-files:
       - any-glob-to-any-file:
           - ggml/include/ggml-kompute.h
-          - ggml/src/ggml-kompute/**
+          - ggml/src/ggml-kompute.cpp
           - README-kompute.md
 Apple Metal:
   - changed-files:
       - any-glob-to-any-file:
           - ggml/include/ggml-metal.h
-          - ggml/src/ggml-metal/**
+          - ggml/src/ggml-metal.cpp
           - README-metal.md
 SYCL:
   - changed-files:
       - any-glob-to-any-file:
           - ggml/include/ggml-sycl.h
+          - ggml/src/ggml-sycl.cpp
           - ggml/src/ggml-sycl/**
           - docs/backend/SYCL.md
           - examples/sycl/**
@@ -26,8 +27,8 @@ Nvidia GPU:
 Vulkan:
   - changed-files:
       - any-glob-to-any-file:
-          - ggml/include/ggml-vulkan.h
-          - ggml/src/ggml-vulkan/**
+          - ggml/ggml_vk_generate_shaders.py
+          - ggml/src/ggml-vulkan*
 documentation:
   - changed-files:
       - any-glob-to-any-file:
@@ -74,7 +75,11 @@ server:
 ggml:
   - changed-files:
       - any-glob-to-any-file:
-          - ggml/**
+          - ggml/include/ggml*.h
+          - ggml/src/ggml*.c
+          - ggml/src/ggml*.cpp
+          - ggml/src/ggml*.h
+          - ggml-cuda/**
 nix:
   - changed-files:
       - any-glob-to-any-file:

8  .github/pull_request_template.md  (vendored)
@@ -1 +1,7 @@
-*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
+
+- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+- Self-reported review complexity:
+  - [ ] Low
+  - [ ] Medium
+  - [ ] High
+
@@ -1,6 +1,3 @@
-# TODO: there have been some issues with the workflow, so disabling for now
-# https://github.com/ggerganov/llama.cpp/issues/7893
-#
 # Benchmark
 name: Benchmark
 
@@ -27,10 +24,10 @@ on:
   push:
     branches:
       - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
   pull_request_target:
     types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
   schedule:
     - cron: '04 2 * * *'
 
@@ -132,8 +129,6 @@ jobs:
 
       - name: Server bench
        id: server_bench
-        env:
-          HEAD_REF: ${{ github.head_ref || github.ref_name }}
        run: |
          set -eux
 
@@ -142,7 +137,7 @@ jobs:
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
-              --branch $HEAD_REF \
+              --branch ${{ github.head_ref || github.ref_name }} \
              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
              --scenario script.js \
              --duration ${{ github.event.inputs.duration || env.DURATION }} \

853  .github/workflows/build.yml  (vendored): file diff suppressed because it is too large

7  .github/workflows/close-issue.yml  (vendored)
@@ -3,11 +3,6 @@ on:
   schedule:
     - cron: "42 0 * * *"
 
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  issues: write
-
 jobs:
   close-issues:
     runs-on: ubuntu-latest
@@ -17,7 +12,7 @@ jobs:
     steps:
       - uses: actions/stale@v5
         with:
-          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
+          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
          days-before-issue-stale: 30
          days-before-issue-close: 14
          stale-issue-label: "stale"

163  .github/workflows/docker.yml  (vendored)

@@ -10,50 +10,48 @@
 name: Publish Docker image
 
 on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    # Rebuild daily rather than on every push because it is expensive
-    - cron: '12 4 * * *'
+  #pull_request:
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
   cancel-in-progress: true
 
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  packages: write
-
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
+    #if: github.event.pull_request.draft == false
 
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     env:
       COMMIT_SHA: ${{ github.sha }}
     strategy:
-      fail-fast: false
       matrix:
        config:
-          # Multi-stage build
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
+          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
+          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
     steps:
       - name: Check out the repo
        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0 # preserve git history, so we can determine the build number
 
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
+        uses: docker/setup-qemu-action@v2
 
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2
 
       - name: Log in to Docker Hub
        uses: docker/login-action@v2
@@ -62,45 +60,9 @@ jobs:
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
-          REPO_NAME="${{ github.event.repository.name }}"
-
-          # determine tag name postfix (build number, commit hash)
-          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
-            TAG_POSTFIX="-b${BUILD_NUMBER}"
-          else
-            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
-            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
-          fi
-          # list all tags possible
-          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
-            TYPE=""
-          else
-            TYPE="-${{ matrix.config.tag }}"
-          fi
-          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
-          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
-          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
-          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
-          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
-          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
-          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
-          echo "full_output_tags=$FULLTAGS" # print out for debugging
-          echo "light_output_tags=$LIGHTTAGS" # print out for debugging
-          echo "server_output_tags=$SERVERTAGS" # print out for debugging
-        env:
-          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
+      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
       - name: Free Disk Space (Ubuntu)
-        if: ${{ matrix.config.free_disk_space == true }}
-        uses: ggml-org/free-disk-space@v1.3.1
+        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
@@ -115,59 +77,40 @@ jobs:
          docker-images: true
          swap-storage: true
 
-      - name: Build and push Full Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.full_output_tags }}
-          file: ${{ matrix.config.dockerfile }}
-          target: full
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-
-      - name: Build and push Light Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.light_output_tags }}
-          file: ${{ matrix.config.dockerfile }}
-          target: light
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-
-      - name: Build and push Server Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
-        uses: docker/build-push-action@v6
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Downcase github.repository_owner
+        run: |
+          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
+        env:
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
+      - name: Build and push Docker image (versioned)
+        if: github.event_name == 'push'
+        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.server_output_tags }}
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          file: ${{ matrix.config.dockerfile }}
+
+      - name: Build and push Docker image (tagged)
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: ${{ github.event_name == 'push' }}
+          platforms: ${{ matrix.config.platforms }}
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}
-          target: server
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
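The master-side "Determine tag name" step above produces tags of the form prefix + full/light/server + optional backend type + "-b" build number, so the published images would be pulled roughly as follows; the repository owner and the build number below are assumptions used only for illustration.

    # Example pulls matching the tag scheme above; owner and build number are assumptions.
    docker pull ghcr.io/ggerganov/llama.cpp:server            # rolling CPU server image
    docker pull ghcr.io/ggerganov/llama.cpp:full-cuda-b4719   # pinned CUDA build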
4  .github/workflows/editorconfig.yml  (vendored)

@@ -23,7 +23,5 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - uses: editorconfig-checker/action-editorconfig-checker@v2
-        with:
-          version: v3.0.3
+      - uses: editorconfig-checker/action-editorconfig-checker@main
       - run: editorconfig-checker
65  .github/workflows/nix-ci-aarch64.yml  (vendored, new file)

@@ -0,0 +1,65 @@
name: Nix aarch64 builds

on:
  workflow_dispatch: # allows manual triggering
  schedule:
    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
    # 1.5h instead of minutes with the cold cache).
    #
    # randint(0, 59), randint(0, 23)
    - cron: '26 12 * * *'
  # But also rebuild if we touched any of the Nix expressions:
  push:
    branches:
      - master
    paths: ['**/*.nix', 'flake.lock']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['**/*.nix', 'flake.lock']

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  nix-build-aarch64:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install QEMU
        # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
        run: |
          sudo apt-get update
          sudo apt-get install -y qemu-user-static qemu-system-aarch64
          sudo usermod -a -G kvm $USER
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-platforms = aarch64-linux
            extra-system-features = nixos-test kvm
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: llama-cpp
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.aarch64-linux"
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --systems aarch64-linux
          --flake
          ".#checks.aarch64-linux"
72 .github/workflows/nix-ci.yml vendored Normal file
@@ -0,0 +1,72 @@
name: Nix CI

on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
  pull_request:
    types: [opened, synchronize, reopened]

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  nix-eval:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: List all flake outputs
        run: nix flake show --all-systems
      - name: Show all output paths
        run: >
            nix run github:nix-community/nix-eval-jobs
            -- --gc-roots-dir gcroot
            --flake
            ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
  nix-build:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: llama-cpp
      - name: Build
        run: >
            nix run github:Mic92/nix-fast-build
            -- --skip-cached --no-nom
            --flake
            ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
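For reference, a minimal sketch (illustrative only, assuming flake-enabled Nix in a checkout of the repository) of the per-system attribute selection that the eval and build steps above rely on:

    # builtins.currentSystem resolves to the running platform, e.g. "x86_64-linux" or "aarch64-darwin"
    nix eval --raw --impure --expr builtins.currentSystem
    # evaluate every package output for that system, as the "Show all output paths" step does
    nix run github:nix-community/nix-eval-jobs \
      -- --gc-roots-dir gcroot \
      --flake ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"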
22 .github/workflows/nix-flake-update.yml vendored Normal file
@@ -0,0 +1,22 @@
name: update-flake-lock
on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00

jobs:
  lockfile:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@main
      - name: Update flake.lock
        uses: DeterminateSystems/update-flake-lock@main
        with:
          pr-title: "nix: update flake.lock"
          pr-labels: |
            nix
          pr-reviewers: philiptaron,SomeoneSerge
          token: ${{ secrets.FLAKE_TOKEN }}
36 .github/workflows/nix-publish-flake.yml vendored Normal file
@@ -0,0 +1,36 @@
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
  push:
    tags:
      - "*"
  workflow_dispatch:
    inputs:
      tag:
        description: "The existing tag to publish"
        type: "string"
        required: true
jobs:
  flakestry-publish:
    runs-on: ubuntu-latest
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: flakestry/flakestry-publish@main
        with:
          version: "${{ inputs.tag || github.ref_name }}"
  flakehub-publish:
    runs-on: "ubuntu-latest"
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: "actions/checkout@v4"
        with:
          ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
      - uses: "DeterminateSystems/nix-installer-action@main"
      - uses: "DeterminateSystems/flakehub-push@main"
        with:
          visibility: "public"
          tag: "${{ inputs.tag }}"
.github/workflows/python-check-requirements.yml vendored
@@ -6,13 +6,15 @@ on:
       - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
-      - '**/requirements*.txt'
+      - 'requirements.txt'
+      - 'requirements/*.txt'
   pull_request:
     paths:
       - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
-      - '**/requirements*.txt'
+      - 'requirements.txt'
+      - 'requirements/*.txt'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
9 .github/workflows/python-lint.yml vendored
@@ -1,13 +1,6 @@
 name: flake8 Lint
 
-on:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/python-lint.yml', '**/*.py']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/python-lint.yml', '**/*.py']
+on: [push, pull_request]
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
4 .github/workflows/python-type-check.yml vendored
@@ -4,13 +4,11 @@ on:
   push:
     paths:
      - '.github/workflows/python-type-check.yml'
-      - 'pyrightconfig.json'
       - '**.py'
       - '**/requirements*.txt'
   pull_request:
     paths:
       - '.github/workflows/python-type-check.yml'
-      - 'pyrightconfig.json'
       - '**.py'
       - '**/requirements*.txt'
 
@@ -35,6 +33,6 @@ jobs:
       - name: Type-check with Pyright
         uses: jakebailey/pyright-action@v2
         with:
-          version: 1.1.382
+          version: 1.1.370
           level: warning
           warnings: true
90 .github/workflows/server.yml vendored
@@ -20,12 +20,6 @@ on:
     types: [opened, synchronize, reopened]
     paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
 
-env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
-
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
@@ -76,49 +70,20 @@ jobs:
         run: |
           pip install -r examples/server/tests/requirements.txt
 
-      # Setup nodejs (to be used for verifying bundled index.html)
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '22.11.0'
-
-      - name: WebUI - Install dependencies
-        id: webui_lint
-        run: |
-          cd examples/server/webui
-          npm ci
-
-      - name: WebUI - Check code format
-        id: webui_format
+      - name: Verify server deps
+        id: verify_server_deps
         run: |
           git config --global --add safe.directory $(realpath .)
-          cd examples/server/webui
+          cd examples/server
+          git ls-files --others --modified
           git status
-
-          npm run format
+          ./deps.sh
           git status
-          modified_files="$(git status -s)"
-          echo "Modified files: ${modified_files}"
-          if [ -n "${modified_files}" ]; then
-            echo "Files do not follow coding style. To fix: npm run format"
-            echo "${modified_files}"
-            exit 1
-          fi
-
-      - name: Verify bundled index.html
-        id: verify_server_index_html
-        run: |
-          git config --global --add safe.directory $(realpath .)
-          cd examples/server/webui
-          git status
-
-          npm run build
-          git status
-          modified_files="$(git status -s)"
-          echo "Modified files: ${modified_files}"
-          if [ -n "${modified_files}" ]; then
-            echo "Repository is dirty or server/webui is not built as expected"
-            echo "Hint: You may need to follow Web UI build guide in server/README.md"
-            echo "${modified_files}"
+          not_ignored_files="$(git ls-files --others --modified)"
+          echo "Modified files: ${not_ignored_files}"
+          if [ -n "${not_ignored_files}" ]; then
+            echo "Repository is dirty or server deps are not built as expected"
+            echo "${not_ignored_files}"
             exit 1
           fi
 
@@ -135,9 +100,9 @@ jobs:
             -DGGML_OPENMP=OFF ;
           cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
 
-      - name: Build (sanitizers)
-        id: cmake_build_sanitizers
-        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
+      - name: Build
+        id: cmake_build
+        if: ${{ matrix.sanitizer != 'THREAD' }}
         run: |
           cmake -B build \
             -DGGML_NATIVE=OFF \
@@ -147,37 +112,18 @@ jobs:
             -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
           cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
 
-      - name: Build (sanitizers)
-        id: cmake_build
-        if: ${{ matrix.sanitizer == '' }}
-        run: |
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DLLAMA_CURL=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
       - name: Tests
         id: server_integration_tests
-        if: ${{ matrix.sanitizer == '' }}
         run: |
           cd examples/server/tests
-          ./tests.sh
+          PORT=8888 ./tests.sh
 
-      - name: Tests (sanitizers)
-        id: server_integration_tests_sanitizers
-        if: ${{ matrix.sanitizer != '' }}
-        run: |
-          cd examples/server/tests
-          LLAMA_SANITIZE=1 ./tests.sh
-
       - name: Slow tests
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
-          SLOW_TESTS=1 ./tests.sh
+          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
 
 
   server-windows:
@@ -227,13 +173,11 @@ jobs:
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
-          $env:PYTHONIOENCODING = ":replace"
-          pytest -v -x -m "not slow"
+          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 
       - name: Slow tests
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
-          $env:SLOW_TESTS = "1"
-          pytest -v -x
+          behave.exe --stop --no-skipped --no-capture --tags slow
15 .gitignore vendored
@@ -3,7 +3,6 @@
 *.a
 *.bat
 *.bin
-*.d
 *.dll
 *.dot
 *.etag
@@ -18,7 +17,6 @@
 *.metallib
 *.o
 *.so
-*.swp
 *.tmp
 
 # IDE / OS
@@ -63,7 +61,6 @@ llama-batched-swift
 /rpc-server
 out/
 tmp/
-autogen-*.md
 
 # Deprecated
 
@@ -82,6 +79,7 @@ models-mnt
 !models/ggml-vocab-*.gguf*
 
 # Zig
+
 zig-out/
 zig-cache/
 
@@ -105,10 +103,6 @@ examples/server/*.mjs.hpp
 !examples/sycl/*.bat
 !examples/sycl/*.sh
 
-# Server Web UI temporary files
-node_modules
-examples/server/webui/dist
-
 # Python
 
 /.venv
@@ -136,10 +130,3 @@ poetry.toml
 
 # Scripts
 !/scripts/install-oneapi.bat
-
-# Test models for lora adapters
-/lora-tests
-
-# Local scripts
-/run-vim.sh
-/run-chat.sh
2 .gitmodules vendored
@@ -1,3 +1,3 @@
 [submodule "kompute"]
-path = ggml/src/ggml-kompute/kompute
+path = ggml/src/kompute
 url = https://github.com/nomic-ai/kompute.git
267 AUTHORS
@@ -1,4 +1,4 @@
-# date: Tue Feb 4 13:04:05 EET 2025
+# date: Wed Jun 26 19:36:34 EEST 2024
 # this file is auto-generated by scripts/gen-authors.sh
 
 0cc4m <picard12@live.de>
@@ -7,7 +7,6 @@
 2f38b454 <dxf@protonmail.com>
 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
 44670 <44670@users.noreply.github.com>
-65a <10104049+65a@users.noreply.github.com>
 AN Long <aisk@users.noreply.github.com>
 AT <manyoso@users.noreply.github.com>
 Aarni Koskela <akx@iki.fi>
@@ -20,30 +19,20 @@ Adithya Balaji <adithya.b94@gmail.com>
 AdithyanI <adithyan.i4internet@gmail.com>
 Adrian <smith.adriane@gmail.com>
 Adrian Hesketh <a-h@users.noreply.github.com>
-Adrien Gallouët <adrien@gallouet.fr>
-Adrien Gallouët <angt@huggingface.co>
-Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
 Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
 AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
-AidanBeltonS <aidan.belton@codeplay.com>
 Aisuko <urakiny@gmail.com>
-Akarshan Biswas <akarshan.biswas@gmail.com>
 Akarshan Biswas <akarshanbiswas@fedoraproject.org>
-Al Mochkin <14274697+amochkin@users.noreply.github.com>
 Albert Jin <albert.jin@gmail.com>
 Alberto <57916483+albbus-stack@users.noreply.github.com>
-Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
-Alberto Cabrera Pérez <alberto.cabrera@intel.com>
 Alex <awhill19@icloud.com>
 Alex Azarov <alex@azarov.by>
 Alex Azarov <alexander.azarov@mapbox.com>
 Alex Klinkhamer <from.github.com.917@grencez.dev>
 Alex Klinkhamer <git@grencez.dev>
 Alex Nguyen <tiendung@users.noreply.github.com>
-Alex O'Connell <35843486+acon96@users.noreply.github.com>
 Alex Petenchea <alex.petenchea@gmail.com>
 Alex Renda <alexrenda@users.noreply.github.com>
-Alex Tuddenham <61622354+AlexsCode@users.noreply.github.com>
 Alex von Gluck IV <kallisti5@unixzen.com>
 Alexey Parfenov <zxed@alkatrazstudio.net>
 Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
@@ -56,26 +45,18 @@ AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
 Ananta Bastola <anantarajbastola@gmail.com>
 Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
 András Salamon <ott2@users.noreply.github.com>
-Andreas (Andi) Kunar <andreask@msn.com>
-Andreas Kieslinger <47689530+aendk@users.noreply.github.com>
 Andrei <abetlen@gmail.com>
 Andrew Canis <andrew.canis@gmail.com>
 Andrew Downing <andrew2085@gmail.com>
 Andrew Duffy <a10y@users.noreply.github.com>
 Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
-Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
-Andy Salerno <andysalerno@gmail.com>
 Andy Tai <andy-tai@users.noreply.github.com>
-Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
-Antonis Makropoulos <benuix@gmail.com>
 Arik Poznanski <arikpoz@users.noreply.github.com>
-Armen Kaleshian <kriation@users.noreply.github.com>
 Artem <guinmoon@gmail.com>
 Artem Zinnatullin <ceo@abstractny.gay>
 Artyom Lebedev <vagran.ast@gmail.com>
 Asbjørn Olling <asbjornolling@gmail.com>
 Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
-Asghar Ghorbani <a-ghorbani@users.noreply.github.com>
 Ashish <1856117+ashishdatta@users.noreply.github.com>
 Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
 Ashraful Islam <ashraful.meche@gmail.com>
@@ -94,21 +75,13 @@ Ben Siraphob <bensiraphob@gmail.com>
 Ben Williams <ben@719ben.com>
 Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
 Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
-Benson Wong <mostlygeek@gmail.com>
 Bernat Vadell <hounter.caza@gmail.com>
-Bernhard M. Wiedemann <githubbmwprimary@lsmod.de>
-Bert Wagner <github@bertwagner.com>
-Billel Mokeddem <billel.mokeddem.ml@gmail.com>
 Bingan <70050083+binganao@users.noreply.github.com>
-Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
 Bodo Graumann <mail@bodograumann.de>
 Bono Lv <lvscar@users.noreply.github.com>
 Borislav Stanimirov <b.stanimirov@abv.bg>
-Borislav Stanimirov <b@ibob.bg>
 Branden Butler <bwtbutler@hotmail.com>
-Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
 Brian <mofosyne@gmail.com>
-Brian Cunnie <brian.cunnie@gmail.com>
 Bruce MacDonald <brucewmacdonald@gmail.com>
 Bryan Honof <bryanhonof@gmail.com>
 CJ Pais <cj@cjpais.com>
@@ -117,51 +90,32 @@ Calvin Laurenson <calvin@laurenson.dev>
 Cameron <csteele@steelecameron.com>
 Cameron Kaiser <classilla@users.noreply.github.com>
 Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
-CarryFun <76023481+CarryFun@users.noreply.github.com>
-Carsten Kragelund Jørgensen <carsten@kragelund.me>
-CarterLi999 <664681047@qq.com>
 Casey Primozic <casey@cprimozic.net>
 Casey Primozic <me@ameo.link>
 CausalLM <148736309+CausalLM@users.noreply.github.com>
 Cebtenzzre <cebtenzzre@gmail.com>
-CentricStorm <CentricStorm@users.noreply.github.com>
 Chad Brewbaker <crb002@gmail.com>
-Changyeon Kim <cyzero.kim@samsung.com>
 Chao Jiang <jc19chaoj@zoho.com>
-Charles Xu <63788048+chaxu01@users.noreply.github.com>
-Charles Xu <charles.xu@arm.com>
-Chen Xi <xi2.chen@intel.com>
-Chen Xi <xixichen08@foxmail.com>
 Cheng Shao <terrorjack@type.dance>
-Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
 Chris Elrod <elrodc@gmail.com>
 Chris Kuehl <ckuehl@ckuehl.me>
 Christian Demsar <christian@github.email.demsar.us>
 Christian Demsar <crasm@git.vczf.us>
 Christian Falch <875252+chrfalch@users.noreply.github.com>
-Christian Kastner <ckk@kvr.at>
 Christian Kögler <ck3d@gmx.de>
-Christian Köhnenkamp <cvk5@me.com>
 Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
-Christopher Nielsen <62156882+mascguy@users.noreply.github.com>
 Clark Saben <76020733+csaben@users.noreply.github.com>
 Clint Herron <hanclinto@gmail.com>
-Conrad Kramer <conrad@conradkramer.com>
-Corentin REGAL <corentin.regal@gmail.com>
 CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
-Csaba Kecskemeti <csaba.kecskemeti@gmail.com>
 Cuong Trinh Manh <nguoithichkhampha@gmail.com>
 DAN™ <dranger003@gmail.com>
 Damian Stewart <d@damianstewart.com>
-Dan Johansson <164997844+eddnjjn@users.noreply.github.com>
-Dan Johansson <dan.johansson@arm.com>
 Dane Madsen <dane_madsen@hotmail.com>
 DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
 Daniel Bevenius <daniel.bevenius@gmail.com>
 Daniel Drake <drake@endlessos.org>
 Daniel Hiltgen <dhiltgen@users.noreply.github.com>
 Daniel Illescas Romero <illescas.daniel@protonmail.com>
-Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
 Daniele <57776841+daniandtheweb@users.noreply.github.com>
 DannyDaemonic <DannyDaemonic@gmail.com>
 Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
@@ -175,29 +129,19 @@ David Pflug <david@pflug.email>
 David Renshaw <dwrenshaw@gmail.com>
 David Sommers <12738+databyte@users.noreply.github.com>
 David Yang <davidyang6us@gmail.com>
-DavidKorczynski <david@adalogics.com>
 Dawid Potocki <github@dawidpotocki.com>
 Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
 Dean <Dean.Sinaean@gmail.com>
 Deins <deinsegle@gmail.com>
-Denis Spasyuk <34203011+dspasyuk@users.noreply.github.com>
-Derrick T. Woolworth <dwoolworth@gmail.com>
 Deven Mistry <31466137+deven367@users.noreply.github.com>
-Dibakar Gope <dibakar.gope@arm.com>
 Didzis Gosko <didzis@users.noreply.github.com>
-Diego Devesa <slarengh@gmail.com>
-Diogo Teles Sant'Anna <diogoteles@google.com>
-Djip007 <3705339+Djip007@users.noreply.github.com>
 Djip007 <djip.perois@free.fr>
 Don Mahurin <dmahurin@users.noreply.github.com>
 DooWoong Lee (David) <manics99@naver.com>
 Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
-Dou Xinpeng <15529241576@163.com>
-Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com>
 Douglas Hanley <thesecretaryofwar@gmail.com>
 Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
 Ebey Abraham <ebey97@gmail.com>
-Echo Nolan <echo@echonolan.net>
 Ed Lee <edilee@mozilla.com>
 Ed Lepedus <ed.lepedus@googlemail.com>
 Eddie-Wang <wangjinheng1120@163.com>
@@ -205,16 +149,12 @@ Edward Taylor <edeetee@gmail.com>
 Elaine <elaine.zosa@gmail.com>
 Elbios <141279586+Elbios@users.noreply.github.com>
 Elton Kola <eltonkola@gmail.com>
-Emreerdog <34742675+Emreerdog@users.noreply.github.com>
 Engininja2 <139037756+Engininja2@users.noreply.github.com>
 Equim <sayaka@ekyu.moe>
-Eric Curtin <ecurtin@redhat.com>
-Eric Curtin <ericcurtin17@gmail.com>
 Eric Sommerlade <es0m@users.noreply.github.com>
 Eric Zhang <34133756+EZForever@users.noreply.github.com>
 Erik Garrison <erik.garrison@gmail.com>
 Erik Scholz <Green-Sky@users.noreply.github.com>
-Esko Toivonen <eskot98@gmail.com>
 Ettore Di Giacinto <mudler@users.noreply.github.com>
 Evan Jones <evan.q.jones@gmail.com>
 Evan Miller <emmiller@gmail.com>
@@ -226,27 +166,19 @@ FK <sozforex@gmail.com>
 Fabian <cmdrf@users.noreply.github.com>
 Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
 Faez Shakil <faez.shakil@gmail.com>
-Faisal Zaghloul <faisal.zaghloul@gmail.com>
-Faisal Zaghloul <quic_fzaghlou@quicinc.com>
-Fan Shupei <dymarkfan@outlook.com>
 FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
-Farbod Bijary <110523279+farbodbj@users.noreply.github.com>
 Fattire <528174+fat-tire@users.noreply.github.com>
 Felix <stenbackfelix@gmail.com>
 Finn Voorhees <finnvoorhees@gmail.com>
 Firat <firatkiral@gmail.com>
-FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
 Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
 Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
 Francisco Melo <43780565+francis2tm@users.noreply.github.com>
 Frank Mai <thxcode0824@gmail.com>
 FrankHB <frankhb1989@gmail.com>
-Frankie Robertson <frankier@users.noreply.github.com>
 Fred Douglas <43351173+fredlas@users.noreply.github.com>
 Frederik Vogel <Schaltfehler@users.noreply.github.com>
 Gabe Goodhart <gabe.l.hart@gmail.com>
-Gabe Goodhart <ghart@us.ibm.com>
-Gaetan Bisson <gaetan@fenua.org>
 GainLee <perfecter.gen@gmail.com>
 Galunid <karolek1231456@gmail.com>
 Gary Linscott <glinscott@gmail.com>
@@ -255,15 +187,12 @@ Gavin Zhao <gavinzhaojw@protonmail.com>
 Genkagaku.GPT <hlhr202@163.com>
 Georgi Gerganov <ggerganov@gmail.com>
 Gilad S <giladgd@users.noreply.github.com>
-Gilad S. <7817232+giladgd@users.noreply.github.com>
 Giuseppe Scrivano <giuseppe@scrivano.org>
 GiviMAD <GiviMAD@users.noreply.github.com>
 Govlzkoy <gotope@users.noreply.github.com>
 Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
 Guillaume Wenzek <gwenzek@users.noreply.github.com>
-Guoliang Hua <32868157+nbcsm@users.noreply.github.com>
 Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
-Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com>
 Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
 Haggai Nuchi <h.nuchi@gmail.com>
 Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
@@ -274,47 +203,35 @@ Haoxiang Fei <tonyfettes@tonyfettes.com>
 Harald Fernengel <harald.fernengel@here.com>
 Hatsune Miku <129688334+at8u@users.noreply.github.com>
 HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
-Haus1 <haus.xda@gmail.com>
 Henk Poley <HenkPoley@gmail.com>
 Henri Vasserman <henv@hot.ee>
 Henrik Forstén <henrik.forsten@gmail.com>
 Herman Semenov <GermanAizek@yandex.ru>
 Hesen Peng <hesen.peng@gmail.com>
-HimariO <dsfhe49854@gmail.com>
 Hoang Nguyen <hugo53@users.noreply.github.com>
 Hong Bo PENG <penghb@cn.ibm.com>
 Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
 Howard Su <howard0su@gmail.com>
 Hua Jiang <allenhjiang@outlook.com>
-Huang Qi <huangqi3@xiaomi.com>
 Huawei Lin <huaweilin.cs@gmail.com>
 Hugo Roussel <hugo.rous@gmail.com>
-Huifeng Ou <79071290+ho2103@users.noreply.github.com>
 Ian Bull <irbull@eclipsesource.com>
 Ian Bull <irbull@gmail.com>
 Ian Scrivener <github@zilogy.asia>
-Icecream95 <the.real.icecream95@gmail.com>
 Ido S <ido.pluto@gmail.com>
 IgnacioFDM <ignaciofdm@gmail.com>
 Igor Okulist <okigan@gmail.com>
-Ihar Hrachyshka <ihrachys@redhat.com>
 Ikko Eltociear Ashimine <eltociear@gmail.com>
 Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
 Ionoclast Laboratories <brigham@ionoclast.com>
 Isaac McFadyen <isaac@imcf.me>
 IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
-Ivan <nekotekina@gmail.com>
-Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
 Ivan Komarov <Ivan.Komarov@dfyz.info>
 Ivan Stepanov <ivanstepanovftw@gmail.com>
-JFLFY2255 <JFLFY2255@163.com>
 JH23X <165871467+JH23X@users.noreply.github.com>
-Jack Mousseau <jack@software.inc>
 Jack Mousseau <jmousseau@users.noreply.github.com>
 JackJollimore <130917767+JackJollimore@users.noreply.github.com>
-Jaeden Amero <jaeden@patater.com>
 Jaemin Son <woalsdnd@gmail.com>
-Jafar Uruç <jafar.uruc@gmail.com>
 Jag Chadha <jagtesh@gmail.com>
 Jakub N <jakubniemczyk97@gmail.com>
 James A Capozzoli <157492257+jac-jim@users.noreply.github.com>
@@ -326,16 +243,11 @@ Jannis Schönleber <joennlae@gmail.com>
 Jared Van Bortel <cebtenzzre@gmail.com>
 Jared Van Bortel <jared@nomic.ai>
 Jason McCartney <jmac@theroot.org>
-Jason Stillerman <jason.t.stillerman@gmail.com>
 Jean-Christophe Hoelt <hoelt@fovea.cc>
 Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
 Jed Fox <git@jedfox.com>
-Jeff Bolz <jbolz@nvidia.com>
-Jeffrey Morgan <jmorganca@gmail.com>
 Jeffrey Quesnelle <emozilla@nousresearch.com>
-Jeroen Mostert <jeroen.mostert@cm.com>
 Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
-Jett Janiak <jettjaniak@gmail.com>
 Jeximo <jeximo@gmail.com>
 Jhen-Jie Hong <iainst0409@gmail.com>
 Jiahao Li <liplus17@163.com>
@@ -346,9 +258,6 @@ Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
 Jiří Sejkora <Sejseloid@gmail.com>
 Joan Fontanals <jfontanalsmartinez@gmail.com>
 Joan Fontanals <joan.fontanals.martinez@jina.ai>
-João Dinis Ferreira <hello@joaof.eu>
-Joe Eli McIlvain <joe.eli.mac@gmail.com>
-Joe Todd <joe.todd@codeplay.com>
 Johan <JohanAR@users.noreply.github.com>
 Johannes Gäßler <johannesg@5d6.de>
 Johannes Rudolph <johannes.rudolph@gmail.com>
@@ -364,11 +273,8 @@ Josh Ramer <josh.ramer@icloud.com>
 Joyce <joycebrum@google.com>
 Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
 Judd <foldl@users.noreply.github.com>
-Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
 Julius Arkenberg <arki05@users.noreply.github.com>
-Jun Hee Yoo <contact.jhyoo@gmail.com>
 Jun Jie <71215065+junnjiee16@users.noreply.github.com>
-Junil Kim <logyourself@gmail.com>
 Junyang Lin <justinlin930319@hotmail.com>
 Juraj Bednar <juraj@bednar.io>
 Justin Parker <jparkerweb@gmail.com>
@@ -379,7 +285,6 @@ Justine Tunney <jtunney@mozilla.com>
 Juuso Alasuutari <juuso.alasuutari@gmail.com>
 KASR <karim.asrih@gmail.com>
 Kamil Tomšík <info@tomsik.cz>
-Karol Kontny <82021046+kkontny@users.noreply.github.com>
 Karsten Weiss <knweiss@gmail.com>
 Karthick <j.karthic2004@gmail.com>
 Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
@@ -387,19 +292,16 @@ Karthik Sethuraman <k.seth1993@gmail.com>
 Kasumi <90275229+kasumi-1@users.noreply.github.com>
 Kawrakow <48489457+ikawrakow@users.noreply.github.com>
 Keiichi Tabata <keiichi.tabata@outlook.com>
-Keke Han <hankeke303@163.com>
 Kenvix ⭐ <kenvixzure@live.com>
 Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
 Kevin Gibbons <bakkot@gmail.com>
 Kevin Ji <1146876+kevinji@users.noreply.github.com>
 Kevin Kwok <antimatter15@gmail.com>
 Kevin Lo <kevlo@kevlo.org>
-Kevin Wang <kevmo314@gmail.com>
 Kolen Cheung <ickc@users.noreply.github.com>
 Konstantin Herud <konstantin.herud@denkbares.com>
 Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
 Kunshang Ji <kunshang.ji@intel.com>
-Kyle Bruene <KyleBruene@users.noreply.github.com>
 Kyle Liang <liangmanlai@gmail.com>
 Kyle Mistele <kyle@mistele.com>
 Kylin <56434533+KyL0N@users.noreply.github.com>
@@ -413,30 +315,22 @@ LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
 Leonardo Neumann <leonardo@neumann.dev.br>
 Li Tan <tanliboy@gmail.com>
 Linwei Wang <wanix1988@gmail.com>
-Liu Jia <109258120+Septa2112@users.noreply.github.com>
-Liu Jia <jia3.liu@intel.com>
 LoganDark <github@logandark.mozmail.com>
-Loïc Carrère <loic.carrere@gmail.com>
 LostRuins <39025047+LostRuins@users.noreply.github.com>
-LostRuins Concedo <39025047+LostRuins@users.noreply.github.com>
 Luciano <lucianostrika44@gmail.com>
 Luo Tian <lt@basecity.com>
 Lyle Dean <dean@lyle.dev>
-M-A <maruel@gmail.com>
 M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
-Ma Mingfei <mingfei.ma@intel.com>
 Maarten ter Huurne <maarten@treewalker.org>
 Mack Straight <eiz@users.noreply.github.com>
 Maël Kerbiriou <m431.kerbiriou@gmail.com>
 MaggotHATE <clay1326@gmail.com>
-Mahesh Madhav <67384846+heshpdx@users.noreply.github.com>
 Manuel <44313466+makuche@users.noreply.github.com>
 Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
 Marco Matthies <71844+marcom@users.noreply.github.com>
 Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
 Marian Cepok <marian.cepok@gmail.com>
 Mark Fairbairn <thebaron88@gmail.com>
-Mark Zhuang <zhuangqiubin@gmail.com>
 Marko Tasic <mtasic85@gmail.com>
 Markus Tavenrath <mtavenrath@users.noreply.github.com>
 Martin Delille <martin@delille.org>
@@ -448,16 +342,11 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
 Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
 Matheus C. França <matheus-catarino@hotmail.com>
 Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
-Mathieu Baudier <mbaudier@argeo.org>
-Mathieu Geli <mathieu.geli@gmail.com>
 Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
-Mathijs Henquet <mathijs.henquet@gmail.com>
 Mathijs de Bruin <mathijs@mathijsfietst.nl>
 Matt Clayton <156335168+mattjcly@users.noreply.github.com>
 Matt Pulver <matt.pulver@heavy.ai>
-Matt Stephenson <mstephenson6@users.noreply.github.com>
 Matteo Boschini <12133566+mbosc@users.noreply.github.com>
-Matteo Mortari <matteo.mortari@gmail.com>
 Mattheus Chediak <shammcity00@gmail.com>
 Matthew Tejo <matthew.tejo@gmail.com>
 Matvey Soloviev <blackhole89@gmail.com>
@@ -467,11 +356,8 @@ Maxime <672982+maximegmd@users.noreply.github.com>
 Maximilian Winter <maximilian.winter.91@gmail.com>
 Meng Zhang <meng@tabbyml.com>
 Meng, Hengyu <hengyu.meng@intel.com>
-Mengqing Cao <cmq0113@163.com>
 Merrick Christensen <merrick.christensen@gmail.com>
 Michael Coppola <m18coppola@gmail.com>
-Michael Engel <mengel@redhat.com>
-Michael Francis <edude03@gmail.com>
 Michael Hueschen <m@mhueschen.dev>
 Michael Kesper <mkesper@schokokeks.org>
 Michael Klimenko <mklimenko29@gmail.com>
@@ -479,81 +365,52 @@ Michael Podvitskiy <podvitskiymichael@gmail.com>
 Michael Potter <NanoTekGuy@Gmail.com>
 Michael de Gans <michael.john.degans@gmail.com>
 Michaël de Vries <vriesdemichael@gmail.com>
-Michał Moskal <michal@moskal.me>
-Michał Tuszyński <srgtuszy@gmail.com>
-Michelle Tan <41475767+MichelleTanPY@users.noreply.github.com>
 Mihai <mihai.chirculescu@yahoo.com>
 Mike <ytianhui2004@gmail.com>
 Mikko Juola <mikjuo@gmail.com>
 Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
-Minsoo Cheong <icycle0409@snu.ac.kr>
 Mirko185 <mirkosig@gmail.com>
 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
-MistApproach <98988043+MistApproach@users.noreply.github.com>
 Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
 Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
 Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
-Molly Sophia <mollysophia379@gmail.com>
-MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
 Murilo Santana <mvrilo@gmail.com>
 Musab Gultekin <musabgultekin@users.noreply.github.com>
 Nam D. Tran <42194884+namtranase@users.noreply.github.com>
 Nathan Epstein <nate2@umbc.edu>
-Natsu <chino@hotococoa.moe>
 NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
 Nebula <infinitewormhole@gmail.com>
 Neo Zhang <14088817+arthw@users.noreply.github.com>
 Neo Zhang <zhang.jianyu@outlook.com>
 Neo Zhang Jianyu <jianyu.zhang@intel.com>
 Neuman Vong <neuman.vong@gmail.com>
-NeverLucky <92274250+nvrxq@users.noreply.github.com>
-Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
 Nexesenex <124105151+Nexesenex@users.noreply.github.com>
 Niall Coates <1349685+Niall-@users.noreply.github.com>
-Nicholai Tukanov <nicholaitukanov@gmail.com>
-Nico Bosshard <nico@bosshome.ch>
 Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
 Nicolás Pérez <nicolas_perez@brown.edu>
-Nicolò Scipione <nicolo.scipione@codeplay.com>
 Nigel Bosch <pnigelb@gmail.com>
-Nikita Sarychev <42014488+sARY77@users.noreply.github.com>
 Niklas Korz <niklas@niklaskorz.de>
-NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com>
-Nikolaos Pothitos <pothitos@di.uoa.gr>
 Nikolas <127742645+nneubacher@users.noreply.github.com>
 Nindaleth <Nindaleth@users.noreply.github.com>
-Nuno <rare-magma@posteo.eu>
-OSecret <135510162+OLSecret@users.noreply.github.com>
 Oleksandr Nikitin <oleksandr@tvori.info>
 Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
 Olivier Chafik <ochafik@users.noreply.github.com>
 Ondřej Čertík <ondrej@certik.us>
 Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
-PAB <pierreantoine.bannier@gmail.com>
-Pablo Duboue <pablo.duboue@gmail.com>
-Pascal Patry <ppatry@mtacitlabs.com>
 Patrice Ferlet <metal3d@gmail.com>
 Paul Tsochantaris <ptsochantaris@icloud.com>
-Pavel Zloi <github.com@drteam.rocks>
 Pavol Rusnak <pavol@rusnak.io>
-Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com>
 Pedro Cuenca <pedro@huggingface.co>
-Peter <peter277@users.noreply.github.com>
 Peter Sugihara <peter@campsh.com>
 Phil H <5756783+phiharri@users.noreply.github.com>
 Philip Taron <philip.taron@gmail.com>
 Phillip Kravtsov <phillip@kravtsov.net>
 Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
 Pierrick Hymbert <pierrick.hymbert@gmail.com>
-Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
-Plamen Minev <pacominev@gmail.com>
-Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
 Przemysław Pawełczyk <przemoc@gmail.com>
 Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
 Qingyou Meng <meng.qingyou@gmail.com>
 Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
-R0CKSTAR <xiaodong.ye@mthreads.com>
-R0CKSTAR <yeahdongcn@gmail.com>
 RJ Adriaansen <adriaansen@eshcc.eur.nl>
 Radoslav Gerganov <rgerganov@gmail.com>
 Radosław Gryta <radek.gryta@gmail.com>
@@ -562,16 +419,11 @@ Raj Hammeer Singh Hada <hammeerraj@gmail.com>
 Ralph Soika <ralph.soika@imixs.com>
 Rand Xie <randxiexyy29@gmail.com>
 Randall Fitzgerald <randall@dasaku.net>
-Random Fly <renfei8@live.cn>
 Reinforce-II <fate@eastal.com>
-Rémy Oudompheng <oudomphe@phare.normalesup.org>
 Ren Xuancheng <jklj077@users.noreply.github.com>
 Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
-Reza Kakhki <rezakakhki.de@gmail.com>
 RhinoDevel <RhinoDevel@users.noreply.github.com>
-Riccardo Orlando <Riccorl@users.noreply.github.com>
 Riceball LEE <snowyu.lee@gmail.com>
-Rich Dougherty <rich@rd.nz>
 Richard Kiss <him@richardkiss.com>
 Richard Roberson <richardr1126@gmail.com>
 Rick G <26732651+TheFlipbook@users.noreply.github.com>
@@ -582,39 +434,26 @@ Riley Stewart <ristew@users.noreply.github.com>
 Rinne <AsakusaRinne@gmail.com>
 Rinne <liu_yaohui1998@126.com>
 Robert Brisita <986796+rbrisita@users.noreply.github.com>
-Robert Collins <roberto.tomas.cuentas@gmail.com>
-Robert Ormandi <52251610+ormandi@users.noreply.github.com>
 Robert Sung-wook Shin <edp1096@users.noreply.github.com>
 Robey Holderith <robey@flaminglunchbox.net>
 Robyn <robyngraf@users.noreply.github.com>
 Roger Meier <r.meier@siemens.com>
 Roland <14355895+rbur0425@users.noreply.github.com>
-Romain Biessy <romain.biessy@codeplay.com>
 Romain D <90720+Artefact2@users.noreply.github.com>
 Romain Neutron <romain@neutron.io>
 Roman Parykin <donderom@gmail.com>
 Ron Evans <ron@hybridgroup.com>
 Ron Jailall <rojailal@gmail.com>
-Roni <sulpher@gmx.net>
 Ronny Brendel <ronnybrendel@gmail.com>
 Ronsor <ronsor@ronsor.pw>
 Rowan Hart <rowanbhart@gmail.com>
-Ruan <47767371+ruanych@users.noreply.github.com>
-Ruchira Hasaranga <ruchira66@gmail.com>
-Rudi Servo <rudiservo@gmail.com>
-Ruixin Huang <18860020911@163.com>
 Rune <43761327+Rune-AI@users.noreply.github.com>
-RunningLeon <maningsheng@sensetime.com>
-RunningLeon <mnsheng@yeah.net>
 Ryan Landay <rlanday@gmail.com>
 Ryder Wishart <ryderwishart@gmail.com>
 Ryuei <louixs@users.noreply.github.com>
 Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
||||||
SRHMorris <69468379+SRHMorris@users.noreply.github.com>
|
|
||||||
SXX <sxx1136965276@gmail.com>
|
|
||||||
SakuraUmi <yukinon244@gmail.com>
|
SakuraUmi <yukinon244@gmail.com>
|
||||||
Salvador E. Tropea <stropea@inti.gob.ar>
|
Salvador E. Tropea <stropea@inti.gob.ar>
|
||||||
Salvatore Mesoraca <s.mesoraca16@gmail.com>
|
|
||||||
Sam Spilsbury <smspillaz@gmail.com>
|
Sam Spilsbury <smspillaz@gmail.com>
|
||||||
Sami Farin <3876865+Safari77@users.noreply.github.com>
|
Sami Farin <3876865+Safari77@users.noreply.github.com>
|
||||||
Samuel Maynard <samwmaynard@gmail.com>
|
Samuel Maynard <samwmaynard@gmail.com>
|
||||||
|
@ -624,29 +463,23 @@ Sebastián A <sebastian.aedo29@gmail.com>
|
||||||
SebastianApel <13675545+SebastianApel@users.noreply.github.com>
|
SebastianApel <13675545+SebastianApel@users.noreply.github.com>
|
||||||
Senemu <10880819+Senemu@users.noreply.github.com>
|
Senemu <10880819+Senemu@users.noreply.github.com>
|
||||||
Sergey Alirzaev <zl29ah@gmail.com>
|
Sergey Alirzaev <zl29ah@gmail.com>
|
||||||
Sergio López <slp@redhat.com>
|
|
||||||
Sergio López <slp@sinrega.org>
|
Sergio López <slp@sinrega.org>
|
||||||
Sertaç Özercan <852750+sozercan@users.noreply.github.com>
|
Sertaç Özercan <852750+sozercan@users.noreply.github.com>
|
||||||
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
|
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
|
||||||
ShadovvBeast <ShadovvBeast@gmail.com>
|
ShadovvBeast <ShadovvBeast@gmail.com>
|
||||||
Shakhar Dasgupta <shakhardasgupta@gmail.com>
|
Shakhar Dasgupta <shakhardasgupta@gmail.com>
|
||||||
Shane A <shanea@allenai.org>
|
|
||||||
Shangning Xu <32517059+xushangning@users.noreply.github.com>
|
Shangning Xu <32517059+xushangning@users.noreply.github.com>
|
||||||
Shankar <gshankar.87@gmail.com>
|
|
||||||
Shanshan Shen <467638484@qq.com>
|
|
||||||
Shijie <821898965@qq.com>
|
Shijie <821898965@qq.com>
|
||||||
Shintarou Okada <kokuzen@gmail.com>
|
Shintarou Okada <kokuzen@gmail.com>
|
||||||
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
||||||
Shouzheng Liu <lshzh.hi@gmail.com>
|
Shouzheng Liu <lshzh.hi@gmail.com>
|
||||||
Shuichi Tsutsumi <shuichi0526@gmail.com>
|
Shuichi Tsutsumi <shuichi0526@gmail.com>
|
||||||
Shupei Fan <dymarkfan@outlook.com>
|
|
||||||
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
|
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
|
||||||
Simon Willison <swillison@gmail.com>
|
Simon Willison <swillison@gmail.com>
|
||||||
Siwen Yu <yusiwen@gmail.com>
|
Siwen Yu <yusiwen@gmail.com>
|
||||||
Sky Yan <skyan83@gmail.com>
|
Sky Yan <skyan83@gmail.com>
|
||||||
Slaren <2141330+slaren@users.noreply.github.com>
|
Slaren <2141330+slaren@users.noreply.github.com>
|
||||||
Slava Primenko <primenko.s@gmail.com>
|
Slava Primenko <primenko.s@gmail.com>
|
||||||
Small Grass Forest <zixuanxcl@gmail.com>
|
|
||||||
SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
|
SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
|
||||||
Someone <sergei.kozlukov@aalto.fi>
|
Someone <sergei.kozlukov@aalto.fi>
|
||||||
Someone Serge <sergei.kozlukov@aalto.fi>
|
Someone Serge <sergei.kozlukov@aalto.fi>
|
||||||
|
@ -658,33 +491,25 @@ Stefan Sydow <stefan@sydow.email>
|
||||||
Steffen Röcker <sroecker@gmail.com>
|
Steffen Röcker <sroecker@gmail.com>
|
||||||
Stephan Walter <stephan@walter.name>
|
Stephan Walter <stephan@walter.name>
|
||||||
Stephen Nichols <snichols@users.noreply.github.com>
|
Stephen Nichols <snichols@users.noreply.github.com>
|
||||||
Steve Bonds <sbonds@gmail.com>
|
|
||||||
Steve Grubb <ausearch.1@gmail.com>
|
Steve Grubb <ausearch.1@gmail.com>
|
||||||
Steven Prichard <spprichard20@gmail.com>
|
Steven Prichard <spprichard20@gmail.com>
|
||||||
Steven Roussey <sroussey@gmail.com>
|
Steven Roussey <sroussey@gmail.com>
|
||||||
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
|
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
|
||||||
StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
|
|
||||||
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
|
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
|
||||||
Sukriti Sharma <Ssukriti@users.noreply.github.com>
|
|
||||||
SuperUserNameMan <yoann@terminajones.com>
|
SuperUserNameMan <yoann@terminajones.com>
|
||||||
Sutou Kouhei <kou@cozmixng.org>
|
|
||||||
Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
|
Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
|
||||||
Taikono-Himazin <kazu@po.harenet.ne.jp>
|
Taikono-Himazin <kazu@po.harenet.ne.jp>
|
||||||
Tameem <113388789+AhmadTameem@users.noreply.github.com>
|
Tameem <113388789+AhmadTameem@users.noreply.github.com>
|
||||||
Tamotsu Takahashi <ttakah+github@gmail.com>
|
Tamotsu Takahashi <ttakah+github@gmail.com>
|
||||||
Tei Home <taiteitonghome@proton.me>
|
|
||||||
Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
|
Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
|
||||||
Thatcher Chamberlin <j.thatcher.c@gmail.com>
|
Thatcher Chamberlin <j.thatcher.c@gmail.com>
|
||||||
Theia Vogel <theia@vgel.me>
|
Theia Vogel <theia@vgel.me>
|
||||||
Thérence <13496987+Royalphax@users.noreply.github.com>
|
Thérence <13496987+Royalphax@users.noreply.github.com>
|
||||||
Thibault Terrasson <thibault.terrasson@gmail.com>
|
Thibault Terrasson <thibault.terrasson@gmail.com>
|
||||||
Thomas Klausner <wiz@gatalith.at>
|
Thomas Klausner <wiz@gatalith.at>
|
||||||
Thorsten Sommer <SommerEngineering@users.noreply.github.com>
|
|
||||||
Tim Miller <drasticactions@users.noreply.github.com>
|
Tim Miller <drasticactions@users.noreply.github.com>
|
||||||
Tim Wang <overocean@gmail.com>
|
|
||||||
Timmy Knight <r2d2fish@gmail.com>
|
Timmy Knight <r2d2fish@gmail.com>
|
||||||
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
|
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
|
||||||
Ting Lou <louting@189.cn>
|
|
||||||
Ting Lou <ting.lou@gmail.com>
|
Ting Lou <ting.lou@gmail.com>
|
||||||
Ting Sun <suntcrick@gmail.com>
|
Ting Sun <suntcrick@gmail.com>
|
||||||
Tobias Lütke <tobi@shopify.com>
|
Tobias Lütke <tobi@shopify.com>
|
||||||
|
@ -692,44 +517,32 @@ Tom C <tom.corelis@gmail.com>
|
||||||
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
|
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
|
||||||
Tomas <tom.tomas.36478119@gmail.com>
|
Tomas <tom.tomas.36478119@gmail.com>
|
||||||
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
|
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
|
||||||
Tony Wasserka <4840017+neobrain@users.noreply.github.com>
|
|
||||||
Tristan Druyen <tristan@vault81.mozmail.com>
|
Tristan Druyen <tristan@vault81.mozmail.com>
|
||||||
Tristan Ross <rosscomputerguy@protonmail.com>
|
Tristan Ross <rosscomputerguy@protonmail.com>
|
||||||
Trivikram Kamat <16024985+trivikr@users.noreply.github.com>
|
|
||||||
Tungsten842 <886724vf@anonaddy.me>
|
Tungsten842 <886724vf@anonaddy.me>
|
||||||
Tungsten842 <quantmint@protonmail.com>
|
Tungsten842 <quantmint@protonmail.com>
|
||||||
Tushar <ditsuke@protonmail.com>
|
Tushar <ditsuke@protonmail.com>
|
||||||
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
||||||
Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com>
|
|
||||||
Ulrich Drepper <drepper@gmail.com>
|
Ulrich Drepper <drepper@gmail.com>
|
||||||
Uzo Nweke <uzoechi@gmail.com>
|
Uzo Nweke <uzoechi@gmail.com>
|
||||||
Vaibhav Srivastav <vaibhavs10@gmail.com>
|
Vaibhav Srivastav <vaibhavs10@gmail.com>
|
||||||
Val Kharitonov <mail@kharvd.com>
|
Val Kharitonov <mail@kharvd.com>
|
||||||
Valentin Konovalov <valle.ketsujin@gmail.com>
|
Valentin Konovalov <valle.ketsujin@gmail.com>
|
||||||
Valentin Mamedov <45292985+Inf1delis@users.noreply.github.com>
|
|
||||||
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
|
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
|
||||||
Vali Malinoiu <0x4139@gmail.com>
|
|
||||||
Victor Nogueira <felladrin@gmail.com>
|
Victor Nogueira <felladrin@gmail.com>
|
||||||
Victor Z. Peng <ziliangdotme@gmail.com>
|
Victor Z. Peng <ziliangdotme@gmail.com>
|
||||||
Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
|
|
||||||
Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
|
|
||||||
Vlad <spitfireage@gmail.com>
|
Vlad <spitfireage@gmail.com>
|
||||||
Vladimir <bogdad@gmail.com>
|
Vladimir <bogdad@gmail.com>
|
||||||
Vladimir Malyutin <first-leon@yandex.ru>
|
Vladimir Malyutin <first-leon@yandex.ru>
|
||||||
Vladimir Zorin <vladimir@deviant.guru>
|
Vladimir Zorin <vladimir@deviant.guru>
|
||||||
VoidIsVoid <343750470@qq.com>
|
|
||||||
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
|
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
|
||||||
Wang Qin <37098874+wangqin0@users.noreply.github.com>
|
|
||||||
Wang Ran (汪然) <wangr@smail.nju.edu.cn>
|
|
||||||
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
|
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
|
||||||
Weird Constructor <weirdconstructor@gmail.com>
|
Weird Constructor <weirdconstructor@gmail.com>
|
||||||
Welby Seely <welbyseely@gmail.com>
|
Welby Seely <welbyseely@gmail.com>
|
||||||
Wentai Zhang <rchardx@gmail.com>
|
Wentai Zhang <rchardx@gmail.com>
|
||||||
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
|
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
|
||||||
William Tambellini <william.tambellini@gmail.com>
|
William Tambellini <william.tambellini@gmail.com>
|
||||||
William Tambellini <wtambellini@sdl.com>
|
|
||||||
Willy Tarreau <w@1wt.eu>
|
Willy Tarreau <w@1wt.eu>
|
||||||
Woof Dog <197125663+woof-dog@users.noreply.github.com>
|
|
||||||
Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
|
Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
|
||||||
Wu Jian Ping <wujjpp@hotmail.com>
|
Wu Jian Ping <wujjpp@hotmail.com>
|
||||||
Wu Jian Ping <wujp@greatld.com>
|
Wu Jian Ping <wujp@greatld.com>
|
||||||
|
@ -738,25 +551,15 @@ Xiang (Kevin) Li <kevinli020508@gmail.com>
|
||||||
Xiao-Yong Jin <jinxiaoyong@gmail.com>
|
Xiao-Yong Jin <jinxiaoyong@gmail.com>
|
||||||
XiaotaoChen <chenxiaotao1234@gmail.com>
|
XiaotaoChen <chenxiaotao1234@gmail.com>
|
||||||
Xiaoyi Chen <cxychina@gmail.com>
|
Xiaoyi Chen <cxychina@gmail.com>
|
||||||
Xie Yanbo <xieyanbo@gmail.com>
|
|
||||||
Xingchen Song(宋星辰) <xingchensong1996@163.com>
|
Xingchen Song(宋星辰) <xingchensong1996@163.com>
|
||||||
Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
|
|
||||||
Xuan Son Nguyen <thichthat@gmail.com>
|
Xuan Son Nguyen <thichthat@gmail.com>
|
||||||
Xuan-Son Nguyen <thichthat@gmail.com>
|
|
||||||
Yaiko <elyaiko@hotmail.com>
|
|
||||||
Yann Follet <131855179+YannFollet@users.noreply.github.com>
|
Yann Follet <131855179+YannFollet@users.noreply.github.com>
|
||||||
Yaroslav <yaroslav.yashin@me.com>
|
Yaroslav <yaroslav.yashin@me.com>
|
||||||
Yazan Agha-Schrader <mountaiin@icloud.com>
|
Yazan Agha-Schrader <mountaiin@icloud.com>
|
||||||
Yiming Cui <conandiy@vip.qq.com>
|
Yiming Cui <conandiy@vip.qq.com>
|
||||||
Yishuo Wang <MeouSker77@outlook.com>
|
Yishuo Wang <MeouSker77@outlook.com>
|
||||||
Yoshi Suhara <y.suhara@gmail.com>
|
|
||||||
Yoshi Suhara <ysuhara@nvidia.com>
|
|
||||||
Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
|
|
||||||
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
|
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
|
||||||
Yüg <eugeniosegalaweb@gmail.com>
|
|
||||||
Yui <dev@sleepyyui.com>
|
Yui <dev@sleepyyui.com>
|
||||||
Yun Dou <dixyes@gmail.com>
|
|
||||||
Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
|
|
||||||
Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
|
Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
|
||||||
Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
|
Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
|
||||||
ZHAOKAI WANG <sanxianwei@163.com>
|
ZHAOKAI WANG <sanxianwei@163.com>
|
||||||
|
@ -765,27 +568,19 @@ Zay <95888118+isaiahbjork@users.noreply.github.com>
|
||||||
Zenix <zenixls2@gmail.com>
|
Zenix <zenixls2@gmail.com>
|
||||||
Zhang Peiyuan <a1286225768@gmail.com>
|
Zhang Peiyuan <a1286225768@gmail.com>
|
||||||
Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
|
Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
|
||||||
Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
|
|
||||||
Zhiyuan Li <lizhiyuan@uniartisan.com>
|
|
||||||
Zhiyuan Li <uniartisan2017@gmail.com>
|
|
||||||
ZhouYuChen <zhouyuchen@naver.com>
|
ZhouYuChen <zhouyuchen@naver.com>
|
||||||
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
|
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
|
||||||
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
|
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
|
||||||
Zsapi <martin1.zsapka@gmail.com>
|
Zsapi <martin1.zsapka@gmail.com>
|
||||||
a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
|
a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
|
||||||
a3sh <38979186+A3shTnT@users.noreply.github.com>
|
|
||||||
adel boussaken <netdur@gmail.com>
|
adel boussaken <netdur@gmail.com>
|
||||||
afrideva <95653597+afrideva@users.noreply.github.com>
|
afrideva <95653597+afrideva@users.noreply.github.com>
|
||||||
ag2s20150909 <19373730+ag2s20150909@users.noreply.github.com>
|
|
||||||
agray3 <agray3@users.noreply.github.com>
|
agray3 <agray3@users.noreply.github.com>
|
||||||
akawrykow <142945436+akawrykow@users.noreply.github.com>
|
akawrykow <142945436+akawrykow@users.noreply.github.com>
|
||||||
alek3y <44779186+alek3y@users.noreply.github.com>
|
|
||||||
alexpinel <93524949+alexpinel@users.noreply.github.com>
|
alexpinel <93524949+alexpinel@users.noreply.github.com>
|
||||||
alonfaraj <alonfaraj@gmail.com>
|
alonfaraj <alonfaraj@gmail.com>
|
||||||
alwqx <kenan3015@gmail.com>
|
alwqx <kenan3015@gmail.com>
|
||||||
amd-dwang <dong.wang@amd.com>
|
|
||||||
amd-lalithnc <lalithnc@amd.com>
|
amd-lalithnc <lalithnc@amd.com>
|
||||||
amritahs-ibm <amritahs@linux.vnet.ibm.com>
|
|
||||||
andrijdavid <david@geek.mg>
|
andrijdavid <david@geek.mg>
|
||||||
anon998 <131767832+anon998@users.noreply.github.com>
|
anon998 <131767832+anon998@users.noreply.github.com>
|
||||||
anzz1 <anzz1@live.com>
|
anzz1 <anzz1@live.com>
|
||||||
|
@ -793,31 +588,24 @@ apaz <aarpazdera@gmail.com>
|
||||||
apcameron <37645737+apcameron@users.noreply.github.com>
|
apcameron <37645737+apcameron@users.noreply.github.com>
|
||||||
arch-btw <57669023+arch-btw@users.noreply.github.com>
|
arch-btw <57669023+arch-btw@users.noreply.github.com>
|
||||||
arcrank <arcrank@gmail.com>
|
arcrank <arcrank@gmail.com>
|
||||||
ardfork <134447697+ardfork@users.noreply.github.com>
|
|
||||||
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
|
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
|
||||||
aryantandon01 <80969509+aryantandon01@users.noreply.github.com>
|
|
||||||
at8u <129688334+at8u@users.noreply.github.com>
|
at8u <129688334+at8u@users.noreply.github.com>
|
||||||
automaticcat <daogiatuank54@gmail.com>
|
automaticcat <daogiatuank54@gmail.com>
|
||||||
awatuna <23447591+awatuna@users.noreply.github.com>
|
|
||||||
b4b4o <zwbao@foxmail.com>
|
|
||||||
bandoti <141645996+bandoti@users.noreply.github.com>
|
bandoti <141645996+bandoti@users.noreply.github.com>
|
||||||
beiller <beiller@gmail.com>
|
beiller <beiller@gmail.com>
|
||||||
bhubbb <79117352+bhubbb@users.noreply.github.com>
|
bhubbb <79117352+bhubbb@users.noreply.github.com>
|
||||||
bmwl <brian.marshall@tolko.com>
|
bmwl <brian.marshall@tolko.com>
|
||||||
bobqianic <129547291+bobqianic@users.noreply.github.com>
|
bobqianic <129547291+bobqianic@users.noreply.github.com>
|
||||||
brucepro <git@brucepro.net>
|
|
||||||
bryanSwk <93190252+bryanSwk@users.noreply.github.com>
|
bryanSwk <93190252+bryanSwk@users.noreply.github.com>
|
||||||
bsilvereagle <bsilvereagle@users.noreply.github.com>
|
bsilvereagle <bsilvereagle@users.noreply.github.com>
|
||||||
bssrdf <merlintiger@hotmail.com>
|
bssrdf <merlintiger@hotmail.com>
|
||||||
byte-6174 <88070277+byte-6174@users.noreply.github.com>
|
byte-6174 <88070277+byte-6174@users.noreply.github.com>
|
||||||
cduk <19917266+cduk@users.noreply.github.com>
|
|
||||||
cebtenzzre <cebtenzzre@gmail.com>
|
cebtenzzre <cebtenzzre@gmail.com>
|
||||||
chaihahaha <chai836275709@gmail.com>
|
chaihahaha <chai836275709@gmail.com>
|
||||||
chiranko <96988916+chiranko@users.noreply.github.com>
|
chiranko <96988916+chiranko@users.noreply.github.com>
|
||||||
clibdev <52199778+clibdev@users.noreply.github.com>
|
clibdev <52199778+clibdev@users.noreply.github.com>
|
||||||
clyang <clyang@clyang.net>
|
clyang <clyang@clyang.net>
|
||||||
cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
|
cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
|
||||||
codezjx <code.zjx@gmail.com>
|
|
||||||
coezbek <c.oezbek@gmail.com>
|
coezbek <c.oezbek@gmail.com>
|
||||||
comex <comexk@gmail.com>
|
comex <comexk@gmail.com>
|
||||||
compilade <113953597+compilade@users.noreply.github.com>
|
compilade <113953597+compilade@users.noreply.github.com>
|
||||||
|
@ -826,14 +614,10 @@ cpumaxx <163466046+cpumaxx@users.noreply.github.com>
|
||||||
crasm <crasm@git.vczf.net>
|
crasm <crasm@git.vczf.net>
|
||||||
crasm <crasm@git.vczf.us>
|
crasm <crasm@git.vczf.us>
|
||||||
daboe01 <daboe01@googlemail.com>
|
daboe01 <daboe01@googlemail.com>
|
||||||
daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com>
|
|
||||||
daminho <37615795+daminho@users.noreply.github.com>
|
|
||||||
david raistrick <keen99@users.noreply.github.com>
|
david raistrick <keen99@users.noreply.github.com>
|
||||||
ddh0 <dylanhalladay02@icloud.com>
|
ddh0 <dylanhalladay02@icloud.com>
|
||||||
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
||||||
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
||||||
devojony <61173062+devojony@users.noreply.github.com>
|
|
||||||
ditsuke <ditsuke@protonmail.com>
|
|
||||||
divinity76 <divinity76@gmail.com>
|
divinity76 <divinity76@gmail.com>
|
||||||
dm4 <sunrisedm4@gmail.com>
|
dm4 <sunrisedm4@gmail.com>
|
||||||
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
||||||
|
@ -841,25 +625,18 @@ drbh <david.richard.holtz@gmail.com>
|
||||||
ds5t5 <145942675+ds5t5@users.noreply.github.com>
|
ds5t5 <145942675+ds5t5@users.noreply.github.com>
|
||||||
dylan <canardleteer@users.noreply.github.com>
|
dylan <canardleteer@users.noreply.github.com>
|
||||||
eastriver <lee@eastriver.dev>
|
eastriver <lee@eastriver.dev>
|
||||||
ebraminio <ebrahim@gnu.org>
|
|
||||||
ebraminio <ebraminio@gmail.com>
|
ebraminio <ebraminio@gmail.com>
|
||||||
eiery <19350831+eiery@users.noreply.github.com>
|
eiery <19350831+eiery@users.noreply.github.com>
|
||||||
eric8607242 <e0928021388@gmail.com>
|
eric8607242 <e0928021388@gmail.com>
|
||||||
fairydreaming <166155368+fairydreaming@users.noreply.github.com>
|
fairydreaming <166155368+fairydreaming@users.noreply.github.com>
|
||||||
fengerhu1 <2748250768@qq.com>
|
|
||||||
fj-y-saito <85871716+fj-y-saito@users.noreply.github.com>
|
|
||||||
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
||||||
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
||||||
gliptic <gliptic@users.noreply.github.com>
|
gliptic <gliptic@users.noreply.github.com>
|
||||||
gn64 <yukikaze.jp@gmail.com>
|
|
||||||
goerch <jhr.walter@t-online.de>
|
goerch <jhr.walter@t-online.de>
|
||||||
grahameth <96447521+grahameth@users.noreply.github.com>
|
grahameth <96447521+grahameth@users.noreply.github.com>
|
||||||
gtygo <gtydoit@gmail.com>
|
|
||||||
gwjr <502526+gwjr@users.noreply.github.com>
|
gwjr <502526+gwjr@users.noreply.github.com>
|
||||||
h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
|
h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
|
||||||
hankcs <cnhankmc@gmail.com>
|
hankcs <cnhankmc@gmail.com>
|
||||||
haopeng <657407891@qq.com>
|
|
||||||
hipudding <huafengchun@gmail.com>
|
|
||||||
hoangmit <hoangmit@users.noreply.github.com>
|
hoangmit <hoangmit@users.noreply.github.com>
|
||||||
hongbo.mo <352280764@qq.com>
|
hongbo.mo <352280764@qq.com>
|
||||||
hopkins385 <98618192+hopkins385@users.noreply.github.com>
|
hopkins385 <98618192+hopkins385@users.noreply.github.com>
|
||||||
|
@ -872,16 +649,12 @@ hxer7963 <hxer7963@gmail.com>
|
||||||
hydai <z54981220@gmail.com>
|
hydai <z54981220@gmail.com>
|
||||||
iSma <ismail.senhaji@gmail.com>
|
iSma <ismail.senhaji@gmail.com>
|
||||||
iacore <74560659+iacore@users.noreply.github.com>
|
iacore <74560659+iacore@users.noreply.github.com>
|
||||||
icppWorld <124377669+icppWorld@users.noreply.github.com>
|
|
||||||
igarnier <igarnier@protonmail.com>
|
igarnier <igarnier@protonmail.com>
|
||||||
intelmatt <61025942+intelmatt@users.noreply.github.com>
|
intelmatt <61025942+intelmatt@users.noreply.github.com>
|
||||||
iohub <rickyang.pro@gmail.com>
|
iohub <rickyang.pro@gmail.com>
|
||||||
issixx <46835150+issixx@users.noreply.github.com>
|
|
||||||
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
||||||
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
|
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
|
||||||
jameswu2014 <545426914@qq.com>
|
jameswu2014 <545426914@qq.com>
|
||||||
jdomke <28772296+jdomke@users.noreply.github.com>
|
|
||||||
jiahao su <damow890@gmail.com>
|
|
||||||
jiez <373447296@qq.com>
|
jiez <373447296@qq.com>
|
||||||
jneem <joeneeman@gmail.com>
|
jneem <joeneeman@gmail.com>
|
||||||
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
|
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
|
||||||
|
@ -894,7 +667,6 @@ junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
|
||||||
jwj7140 <32943891+jwj7140@users.noreply.github.com>
|
jwj7140 <32943891+jwj7140@users.noreply.github.com>
|
||||||
k.h.lai <adrian.k.h.lai@outlook.com>
|
k.h.lai <adrian.k.h.lai@outlook.com>
|
||||||
kaizau <kaizau@users.noreply.github.com>
|
kaizau <kaizau@users.noreply.github.com>
|
||||||
kallewoof <kalle.alm@gmail.com>
|
|
||||||
kalomaze <66376113+kalomaze@users.noreply.github.com>
|
kalomaze <66376113+kalomaze@users.noreply.github.com>
|
||||||
kang <tpdns9032100@gmail.com>
|
kang <tpdns9032100@gmail.com>
|
||||||
katsu560 <118887472+katsu560@users.noreply.github.com>
|
katsu560 <118887472+katsu560@users.noreply.github.com>
|
||||||
|
@ -902,46 +674,32 @@ kchro3 <62481661+kchro3@users.noreply.github.com>
|
||||||
khimaros <me@khimaros.com>
|
khimaros <me@khimaros.com>
|
||||||
kiltyj <kiltyj@gmail.com>
|
kiltyj <kiltyj@gmail.com>
|
||||||
klosax <131523366+klosax@users.noreply.github.com>
|
klosax <131523366+klosax@users.noreply.github.com>
|
||||||
krystiancha <krystian@krystianch.com>
|
|
||||||
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
|
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
|
||||||
kunnis <kunnis@users.noreply.github.com>
|
kunnis <kunnis@users.noreply.github.com>
|
||||||
kuronekosaiko <EvanChanJ@163.com>
|
kuronekosaiko <EvanChanJ@163.com>
|
||||||
kustaaya <58045274+kustaaya@users.noreply.github.com>
|
|
||||||
kuvaus <22169537+kuvaus@users.noreply.github.com>
|
kuvaus <22169537+kuvaus@users.noreply.github.com>
|
||||||
kwin1412 <42286931+kwin1412@users.noreply.github.com>
|
kwin1412 <42286931+kwin1412@users.noreply.github.com>
|
||||||
l3utterfly <gc.pthzfoldr@gmail.com>
|
l3utterfly <gc.pthzfoldr@gmail.com>
|
||||||
laik <laik.lj@me.com>
|
|
||||||
ldwang <ftgreat@163.com>
|
ldwang <ftgreat@163.com>
|
||||||
le.chang <cljs118@126.com>
|
le.chang <cljs118@126.com>
|
||||||
leejet <leejet714@gmail.com>
|
leejet <leejet714@gmail.com>
|
||||||
leo-pony <nengjunma@outlook.com>
|
|
||||||
lexasub <lexakopp2212@gmail.com>
|
|
||||||
lhez <quic_lih@quicinc.com>
|
|
||||||
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
||||||
liuwei-git <14815172+liuwei-git@users.noreply.github.com>
|
liuwei-git <14815172+liuwei-git@users.noreply.github.com>
|
||||||
lon <114724657+longregen@users.noreply.github.com>
|
lon <114724657+longregen@users.noreply.github.com>
|
||||||
loonerin <132926317+loonerin@users.noreply.github.com>
|
loonerin <132926317+loonerin@users.noreply.github.com>
|
||||||
ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
|
|
||||||
luoyu-intel <yu.luo@intel.com>
|
luoyu-intel <yu.luo@intel.com>
|
||||||
m3ndax <adrian.goessl@outlook.com>
|
m3ndax <adrian.goessl@outlook.com>
|
||||||
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
||||||
mahorozte <41834471+mahorozte@users.noreply.github.com>
|
|
||||||
makomk <makosoft@googlemail.com>
|
makomk <makosoft@googlemail.com>
|
||||||
manikbhandari <mbbhandarimanik2@gmail.com>
|
manikbhandari <mbbhandarimanik2@gmail.com>
|
||||||
maor-ps <154728172+maor-ps@users.noreply.github.com>
|
maor-ps <154728172+maor-ps@users.noreply.github.com>
|
||||||
mashdragon <122402293+mashdragon@users.noreply.github.com>
|
|
||||||
matiaslin <45382001+matiaslin@users.noreply.github.com>
|
|
||||||
matt23654 <matthew.webber@protonmail.com>
|
|
||||||
matteo <matteogeniaccio@yahoo.it>
|
|
||||||
mdrokz <mohammadmunshi@gmail.com>
|
mdrokz <mohammadmunshi@gmail.com>
|
||||||
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
||||||
minarchist <minarchist@users.noreply.github.com>
|
minarchist <minarchist@users.noreply.github.com>
|
||||||
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
|
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
|
||||||
mmyjona <jonathan.gonse@gmail.com>
|
mmyjona <jonathan.gonse@gmail.com>
|
||||||
momonga <115213907+mmnga@users.noreply.github.com>
|
momonga <115213907+mmnga@users.noreply.github.com>
|
||||||
momonga <146910567+mmngays@users.noreply.github.com>
|
|
||||||
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
|
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
|
||||||
musoles <135031143+musoles@users.noreply.github.com>
|
|
||||||
mzcu <milos.cubrilo@gmail.com>
|
mzcu <milos.cubrilo@gmail.com>
|
||||||
nanahi <130121847+na-na-hi@users.noreply.github.com>
|
nanahi <130121847+na-na-hi@users.noreply.github.com>
|
||||||
ngc92 <7938269+ngc92@users.noreply.github.com>
|
ngc92 <7938269+ngc92@users.noreply.github.com>
|
||||||
|
@ -958,21 +716,16 @@ omahs <73983677+omahs@users.noreply.github.com>
|
||||||
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
||||||
opparco <parco.opaai@gmail.com>
|
opparco <parco.opaai@gmail.com>
|
||||||
ostix360 <55257054+ostix360@users.noreply.github.com>
|
ostix360 <55257054+ostix360@users.noreply.github.com>
|
||||||
pculliton <phillipculliton@gmail.com>
|
|
||||||
peidaqi <peidaqi@gmail.com>
|
|
||||||
pengxin99 <pengxin.yuan@intel.com>
|
pengxin99 <pengxin.yuan@intel.com>
|
||||||
perserk <perserk@gmail.com>
|
perserk <perserk@gmail.com>
|
||||||
piDack <104877312+piDack@users.noreply.github.com>
|
|
||||||
pmysl <piotr.myslinski@outlook.com>
|
pmysl <piotr.myslinski@outlook.com>
|
||||||
postmasters <namnguyen@google.com>
|
postmasters <namnguyen@google.com>
|
||||||
pudepiedj <pudepiedj@gmail.com>
|
pudepiedj <pudepiedj@gmail.com>
|
||||||
qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
|
qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
|
||||||
qingy1337 <qxli2@students.everettcc.edu>
|
|
||||||
qouoq <qouoq@fastmail.com>
|
qouoq <qouoq@fastmail.com>
|
||||||
qunash <anzoria@gmail.com>
|
qunash <anzoria@gmail.com>
|
||||||
rabidcopy <rabidcopy@yahoo.com>
|
rabidcopy <rabidcopy@yahoo.com>
|
||||||
rankaiyx <rankaiyx@rankaiyx.com>
|
rankaiyx <rankaiyx@rankaiyx.com>
|
||||||
redbeard <bharrington@alticon.net>
|
|
||||||
rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
|
rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
|
||||||
rhuddleston <ryan.huddleston@percona.com>
|
rhuddleston <ryan.huddleston@percona.com>
|
||||||
rimoliga <53384203+rimoliga@users.noreply.github.com>
|
rimoliga <53384203+rimoliga@users.noreply.github.com>
|
||||||
|
@ -980,7 +733,6 @@ runfuture <runfuture@users.noreply.github.com>
|
||||||
sandyiscool <sandyiscool@gmail.com>
|
sandyiscool <sandyiscool@gmail.com>
|
||||||
sasha0552 <admin@sasha0552.org>
|
sasha0552 <admin@sasha0552.org>
|
||||||
semidark <me@semidark.net>
|
semidark <me@semidark.net>
|
||||||
serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
|
|
||||||
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
||||||
shibe2 <shibe@tuta.io>
|
shibe2 <shibe@tuta.io>
|
||||||
singularity <12184989+singularity-s0@users.noreply.github.com>
|
singularity <12184989+singularity-s0@users.noreply.github.com>
|
||||||
|
@ -989,59 +741,42 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
|
||||||
slaren <2141330+slaren@users.noreply.github.com>
|
slaren <2141330+slaren@users.noreply.github.com>
|
||||||
slaren <slarengh@gmail.com>
|
slaren <slarengh@gmail.com>
|
||||||
snadampal <87143774+snadampal@users.noreply.github.com>
|
snadampal <87143774+snadampal@users.noreply.github.com>
|
||||||
someone13574 <81528246+someone13574@users.noreply.github.com>
|
|
||||||
standby24x7 <standby24x7@gmail.com>
|
|
||||||
staviq <staviq@gmail.com>
|
staviq <staviq@gmail.com>
|
||||||
stduhpf <stephduh@live.fr>
|
stduhpf <stephduh@live.fr>
|
||||||
strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
|
strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
|
||||||
swittk <switt1995@gmail.com>
|
swittk <switt1995@gmail.com>
|
||||||
takov751 <40316768+takov751@users.noreply.github.com>
|
takov751 <40316768+takov751@users.noreply.github.com>
|
||||||
tarcey <cey.tarik@gmail.com>
|
tarcey <cey.tarik@gmail.com>
|
||||||
tc-mb <157115220+tc-mb@users.noreply.github.com>
|
|
||||||
texmex76 <40733439+texmex76@users.noreply.github.com>
|
texmex76 <40733439+texmex76@users.noreply.github.com>
|
||||||
thement <40525767+thement@users.noreply.github.com>
|
thement <40525767+thement@users.noreply.github.com>
|
||||||
thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
|
|
||||||
tjohnman <tjohnman@users.noreply.github.com>
|
tjohnman <tjohnman@users.noreply.github.com>
|
||||||
toyer <2042519524@qq.com>
|
|
||||||
tslmy <tslmy@users.noreply.github.com>
|
tslmy <tslmy@users.noreply.github.com>
|
||||||
ubik2 <ubik2@users.noreply.github.com>
|
ubik2 <ubik2@users.noreply.github.com>
|
||||||
uint256_t <konndennsa@gmail.com>
|
uint256_t <konndennsa@gmail.com>
|
||||||
uint256_t <maekawatoshiki1017@gmail.com>
|
uint256_t <maekawatoshiki1017@gmail.com>
|
||||||
unbounded <haakon@likedan.net>
|
unbounded <haakon@likedan.net>
|
||||||
uvos <devnull@uvos.xyz>
|
|
||||||
uvos <philipp@uvos.xyz>
|
|
||||||
valiray <133289098+valiray@users.noreply.github.com>
|
valiray <133289098+valiray@users.noreply.github.com>
|
||||||
vb <vaibhavs10@gmail.com>
|
|
||||||
vik <vikhyatk@gmail.com>
|
vik <vikhyatk@gmail.com>
|
||||||
viric <viric@viric.name>
|
viric <viric@viric.name>
|
||||||
vodkaslime <646329483@qq.com>
|
vodkaslime <646329483@qq.com>
|
||||||
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
||||||
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
||||||
wangshuai09 <391746016@qq.com>
|
|
||||||
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
||||||
whoreson <139810751+whoreson@users.noreply.github.com>
|
whoreson <139810751+whoreson@users.noreply.github.com>
|
||||||
woachk <24752637+woachk@users.noreply.github.com>
|
woachk <24752637+woachk@users.noreply.github.com>
|
||||||
wonjun Jang <strutive07@gmail.com>
|
wonjun Jang <strutive07@gmail.com>
|
||||||
woodx <124784234+woodx9@users.noreply.github.com>
|
woodx <124784234+woodx9@users.noreply.github.com>
|
||||||
wwoodsTM <104587230+wwoodsTM@users.noreply.github.com>
|
|
||||||
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
||||||
xaedes <xaedes@gmail.com>
|
xaedes <xaedes@gmail.com>
|
||||||
xaedes <xaedes@googlemail.com>
|
xaedes <xaedes@googlemail.com>
|
||||||
xctan <axunlei@gmail.com>
|
|
||||||
xloem <0xloem@gmail.com>
|
xloem <0xloem@gmail.com>
|
||||||
yangli2 <yangli2@gmail.com>
|
yangli2 <yangli2@gmail.com>
|
||||||
ymcki <84055651+ymcki@users.noreply.github.com>
|
|
||||||
yuiseki <yuiseki@gmail.com>
|
yuiseki <yuiseki@gmail.com>
|
||||||
yuri@FreeBSD <yurivict@users.noreply.github.com>
|
|
||||||
zakkor <edward.partenie@gmail.com>
|
zakkor <edward.partenie@gmail.com>
|
||||||
zhangkaihuo <zhangkaihuo@gmail.com>
|
zhangkaihuo <zhangkaihuo@gmail.com>
|
||||||
zhentaoyu <zhentao.yu@intel.com>
|
|
||||||
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
||||||
zhouwg <zhouwg2000@gmail.com>
|
zhouwg <zhouwg2000@gmail.com>
|
||||||
zrm <trustiosity.zrm@gmail.com>
|
zrm <trustiosity.zrm@gmail.com>
|
||||||
Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
|
Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
|
||||||
杨朱 · Kiki <baofa.fan@daocloud.io>
|
|
||||||
源文雨 <41315874+fumiama@users.noreply.github.com>
|
源文雨 <41315874+fumiama@users.noreply.github.com>
|
||||||
蕭澧邦 <45505768+shou692199@users.noreply.github.com>
|
|
||||||
谢乃闻 <sienaiwun@users.noreply.github.com>
|
|
||||||
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
|
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
|
||||||
|
|
104 CMakeLists.txt
|
@ -16,7 +16,6 @@ endif()
|
||||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||||
|
|
||||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
|
||||||
|
|
||||||
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||||
set(LLAMA_STANDALONE ON)
|
set(LLAMA_STANDALONE ON)
|
||||||
|
@ -47,13 +46,6 @@ if (WIN32)
|
||||||
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (MSVC)
|
|
||||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
|
|
||||||
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
|
|
||||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
|
|
||||||
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# option list
|
# option list
|
||||||
#
|
#
|
||||||
|
@ -70,9 +62,6 @@ option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
|
||||||
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
|
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
|
||||||
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
|
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
|
||||||
|
|
||||||
# utils
|
|
||||||
option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
|
|
||||||
|
|
||||||
# extra artifacts
|
# extra artifacts
|
||||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||||
|
@ -80,23 +69,24 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||||
|
|
||||||
# 3rd party libs
|
# 3rd party libs
|
||||||
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
|
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
|
||||||
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
|
|
||||||
|
|
||||||
# Required for relocatable CMake package
|
# Required for relocatable CMake package
|
||||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
|
|
||||||
|
|
||||||
# override ggml options
|
# override ggml options
|
||||||
|
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
|
||||||
|
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
|
||||||
|
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
|
||||||
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
||||||
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
||||||
|
|
||||||
# change the default for these ggml options
|
# change the default for these ggml options
|
||||||
if (NOT DEFINED GGML_LLAMAFILE)
|
if (NOT DEFINED GGML_LLAMAFILE)
|
||||||
set(GGML_LLAMAFILE_DEFAULT ON)
|
set(GGML_LLAMAFILE ON)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (NOT DEFINED GGML_CUDA_GRAPHS)
|
if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
|
||||||
set(GGML_CUDA_GRAPHS_DEFAULT ON)
|
set(GGML_CUDA_USE_GRAPHS ON)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# transition helpers
|
# transition helpers
|
||||||
|
@ -118,62 +108,16 @@ llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
|
||||||
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
|
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
|
||||||
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
|
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
|
||||||
|
|
||||||
if (NOT MSVC)
|
|
||||||
if (LLAMA_SANITIZE_THREAD)
|
|
||||||
message(STATUS "Using -fsanitize=thread")
|
|
||||||
|
|
||||||
add_compile_options(-fsanitize=thread)
|
|
||||||
link_libraries (-fsanitize=thread)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (LLAMA_SANITIZE_ADDRESS)
|
|
||||||
message(STATUS "Using -fsanitize=address")
|
|
||||||
|
|
||||||
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
|
|
||||||
link_libraries (-fsanitize=address)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (LLAMA_SANITIZE_UNDEFINED)
|
|
||||||
message(STATUS "Using -fsanitize=undefined")
|
|
||||||
|
|
||||||
add_compile_options(-fsanitize=undefined)
|
|
||||||
link_libraries (-fsanitize=undefined)
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# 3rd-party
|
# build the library
|
||||||
#
|
#
|
||||||
|
|
||||||
if (NOT TARGET ggml)
|
if (NOT TARGET ggml)
|
||||||
add_subdirectory(ggml)
|
add_subdirectory(ggml)
|
||||||
# ... otherwise assume ggml is added by a parent CMakeLists.txt
|
# ... otherwise assume ggml is added by a parent CMakeLists.txt
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
#
|
|
||||||
# build the library
|
|
||||||
#
|
|
||||||
|
|
||||||
add_subdirectory(src)
|
add_subdirectory(src)
|
||||||
|
|
||||||
#
|
|
||||||
# utils, programs, examples and tests
|
|
||||||
#
|
|
||||||
|
|
||||||
if (LLAMA_BUILD_COMMON)
|
|
||||||
add_subdirectory(common)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
|
||||||
include(CTest)
|
|
||||||
add_subdirectory(tests)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
|
|
||||||
add_subdirectory(examples)
|
|
||||||
add_subdirectory(pocs)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# install
|
# install
|
||||||
#
|
#
|
||||||
|
@ -189,14 +133,18 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
|
||||||
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
||||||
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
||||||
|
|
||||||
set(LLAMA_PUBLIC_HEADERS
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
|
|
||||||
|
|
||||||
set_target_properties(llama
|
# At the moment some compile definitions are placed within the ggml/src
|
||||||
PROPERTIES
|
# directory but not exported on the `ggml` target. This could be improved by
|
||||||
PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
|
# determining _precisely_ which defines are necessary for the llama-config
|
||||||
|
# package.
|
||||||
|
#
|
||||||
|
get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS)
|
||||||
|
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
|
||||||
|
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
|
||||||
|
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
|
||||||
|
|
||||||
|
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
|
||||||
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
||||||
|
|
||||||
configure_package_config_file(
|
configure_package_config_file(
|
||||||
|
@ -233,4 +181,20 @@ configure_file(cmake/llama.pc.in
|
||||||
@ONLY)
|
@ONLY)
|
||||||
|
|
||||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
|
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
|
||||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
DESTINATION lib/pkgconfig)
|
||||||
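The install rules above export the `llama` library together with its public headers and a `llama.pc` pkg-config file. As a quick sanity check of what a downstream consumer gets, here is a minimal sketch in C++; it assumes the C API entry points `llama_backend_init`, `llama_print_system_info` and `llama_backend_free` from `llama.h`, and the `g++`/`pkg-config` invocation in the comment is only one possible way to build it:

```cpp
// Minimal consumer of the installed llama package (illustrative sketch).
// Possible build command, assuming llama.pc is visible to pkg-config:
//   g++ -std=c++17 main.cpp $(pkg-config --cflags --libs llama) -o main
#include <cstdio>

#include <llama.h>

int main() {
    llama_backend_init();                       // initialize the ggml backends
    printf("%s\n", llama_print_system_info());  // report detected CPU/GPU features
    llama_backend_free();                       // release backend resources
    return 0;
}
```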
|
|
||||||
|
#
|
||||||
|
# programs, examples and tests
|
||||||
|
#
|
||||||
|
|
||||||
|
add_subdirectory(common)
|
||||||
|
|
||||||
|
if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
||||||
|
include(CTest)
|
||||||
|
add_subdirectory(tests)
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
if (LLAMA_BUILD_EXAMPLES)
|
||||||
|
add_subdirectory(examples)
|
||||||
|
add_subdirectory(pocs)
|
||||||
|
endif()
|
||||||
|
CMakePresets.json
|
@ -28,20 +28,11 @@
|
||||||
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
|
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
|
||||||
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
||||||
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
|
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
|
||||||
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
|
|
||||||
{ "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },
|
|
||||||
|
|
||||||
{
|
|
||||||
"name": "x64-windows-llvm", "hidden": true,
|
|
||||||
"cacheVariables": {
|
|
||||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
{
|
||||||
"name": "arm64-windows-msvc", "hidden": true,
|
"name": "arm64-windows-msvc", "hidden": true,
|
||||||
"architecture": { "value": "arm64", "strategy": "external" },
|
"architecture": { "value": "arm64", "strategy": "external" },
|
||||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
||||||
"cacheVariables": {
|
"cacheVariables": {
|
||||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
|
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
|
||||||
}
|
}
|
||||||
|
@ -50,48 +41,25 @@
|
||||||
{
|
{
|
||||||
"name": "arm64-windows-llvm", "hidden": true,
|
"name": "arm64-windows-llvm", "hidden": true,
|
||||||
"architecture": { "value": "arm64", "strategy": "external" },
|
"architecture": { "value": "arm64", "strategy": "external" },
|
||||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
||||||
"cacheVariables": {
|
"cacheVariables": {
|
||||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
|
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
|
||||||
"name": "arm64-apple-clang", "hidden": true,
|
|
||||||
"architecture": { "value": "arm64", "strategy": "external" },
|
|
||||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
|
||||||
"cacheVariables": {
|
|
||||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
||||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
||||||
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
|
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
|
||||||
|
|
||||||
{ "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
|
|
||||||
{ "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
|
|
||||||
{ "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
|
|
||||||
|
|
||||||
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
||||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
||||||
|
|
||||||
{ "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
|
|
||||||
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
|
|
||||||
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
|
|
||||||
{ "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
|
|
||||||
|
|
||||||
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
|
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
|
||||||
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
|
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
|
||||||
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
|
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
|
||||||
|
|
||||||
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
|
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
|
||||||
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
|
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
|
||||||
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
|
|
||||||
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
|
|
||||||
|
|
||||||
{ "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
|
|
||||||
{ "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
11 CODEOWNERS
|
@ -1,11 +0,0 @@
|
||||||
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
|
|
||||||
|
|
||||||
/ci/ @ggerganov
|
|
||||||
/.devops/*.Dockerfile @ngxson
|
|
||||||
/examples/server/ @ngxson
|
|
||||||
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
|
|
||||||
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
|
|
||||||
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
|
|
||||||
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
|
|
||||||
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
|
||||||
/ggml/src/gguf.cpp @JohannesGaessler
|
|
111 CONTRIBUTING.md
|
@ -1,125 +1,28 @@
|
||||||
# Pull requests (for contributors)
|
# Pull requests (for contributors)
|
||||||
|
|
||||||
- Test your changes:
|
- Test your changes:
|
||||||
|
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
|
||||||
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
||||||
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
|
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
|
||||||
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
|
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
|
||||||
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
|
|
||||||
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
|
|
||||||
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
|
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
|
||||||
|
|
||||||
# Pull requests (for collaborators)
|
# Pull requests (for collaborators)
|
||||||
|
|
||||||
- Squash-merge PRs
|
- Squash-merge PRs
|
||||||
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
|
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
|
||||||
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
|
- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
|
||||||
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
|
|
||||||
|
|
||||||
# Coding guidelines

- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Use sized integer types such as `int32_t` in the public API; `size_t` may also be appropriate for allocation sizes or byte offsets
- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo`
    - In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary

    ```cpp
    // OK
    llama_context * ctx;
    const llama_rope_type rope_type;

    // not OK
    struct llama_context * ctx;
    const enum llama_rope_type rope_type;
    ```

    _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_

- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T$ (a short shape sketch follows this list)


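The following is a minimal, self-contained sketch (not part of the repository) of the `ggml_mul_mat()` shape convention from the list above. It assumes the public `ggml.h` C API; it only builds the tensors to show the resulting shape and does not execute a compute graph:

```cpp
#include "ggml.h"

#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // dimension 0 = columns, dimension 1 = rows (row-major storage)
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2); // 4 columns, 2 rows
    struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3); // 4 columns, 3 rows

    // ggml_mul_mat() requires A->ne[0] == B->ne[0] (the shared inner dimension) and
    // yields C->ne[0] == A->ne[1], C->ne[1] == B->ne[1], i.e. C = B A^T
    struct ggml_tensor * C = ggml_mul_mat(ctx, A, B);

    printf("C: %lld columns x %lld rows\n", (long long) C->ne[0], (long long) C->ne[1]); // 2 columns x 3 rows

    ggml_free(ctx);
    return 0;
}
```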
# Naming guidelines

- Use `snake_case` for function, variable and type names
- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)

    ```cpp
    // not OK
    int small_number;
    int big_number;

    // OK
    int number_small;
    int number_big;
    ```

- Enum values are always in upper case and prefixed with the enum name

    ```cpp
    enum llama_vocab_type {
        LLAMA_VOCAB_TYPE_NONE = 0,
        LLAMA_VOCAB_TYPE_SPM  = 1,
        LLAMA_VOCAB_TYPE_BPE  = 2,
        LLAMA_VOCAB_TYPE_WPM  = 3,
        LLAMA_VOCAB_TYPE_UGM  = 4,
        LLAMA_VOCAB_TYPE_RWKV = 5,
    };
    ```

- The general naming pattern is `<class>_<method>`, with `<method>` being `<action>_<noun>`

    ```cpp
    llama_model_init();           // class: "llama_model",         method: "init"
    llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove"
    llama_sampler_get_seed();     // class: "llama_sampler",       method: "get_seed"
    llama_set_embeddings();       // class: "llama_context",       method: "set_embeddings"
    llama_n_threads();            // class: "llama_context",       method: "n_threads"
    llama_adapter_lora_free();    // class: "llama_adapter_lora",  method: "free"
    ```

    - The `get` `<action>` can be omitted
    - The `<noun>` can be omitted if not necessary
    - The `_context` suffix of the `<class>` is optional. Use it to disambiguate symbols when needed
    - Use `init`/`free` for constructor/destructor `<action>`

- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else

    ```cpp
    typedef struct llama_context * llama_context_t;

    enum llama_pooling_type llama_pooling_type(const llama_context_t ctx);
    ```

    _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_

- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension
- Python filenames are all lowercase with underscores

- _(TODO: abbreviations usage)_
# Preprocessor directives

- _(TODO: add guidelines with examples and apply them to the codebase)_

    ```cpp
    #ifdef FOO
    #endif // FOO
    ```
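Until those guidelines are written down, the one established convention is the closing comment shown above: repeat the macro name after `#endif`. Purely as an illustration (not an official guideline), here is the same convention with an `#else` branch, using the `GGML_USE_ACCELERATE` flag that appears elsewhere in this repository:

```cpp
#ifdef GGML_USE_ACCELERATE
#include <Accelerate/Accelerate.h>
#else
// non-Accelerate fallback goes here
#endif // GGML_USE_ACCELERATE
```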
# Documentation

- Documentation is a community effort
- When you need to look into the source code to figure out how to use an API consider adding a short summary to the header file for future reference (see the sketch below)
- When you notice incorrect or outdated documentation, please update it
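As a sketch of what such a header summary could look like - the struct and function names below are made up purely for illustration and are not actual `llama.cpp` symbols:

```cpp
#include <stdint.h>

struct my_lib_context; // hypothetical opaque type, for illustration only

// Returns the number of tokens still available in the context window of `ctx`,
// or a negative value on error. A one-line summary like this in the header
// saves future readers a trip into the implementation.
int32_t my_lib_n_ctx_available(const struct my_lib_context * ctx);
```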
# Resources

The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:

https://github.com/ggerganov/llama.cpp/projects
Package.swift

@@ -2,6 +2,48 @@
 import PackageDescription

+var sources = [
+    "src/llama.cpp",
+    "src/llama-vocab.cpp",
+    "src/llama-grammar.cpp",
+    "src/llama-sampling.cpp",
+    "src/unicode.cpp",
+    "src/unicode-data.cpp",
+    "ggml/src/ggml.c",
+    "ggml/src/ggml-alloc.c",
+    "ggml/src/ggml-backend.c",
+    "ggml/src/ggml-quants.c",
+    "ggml/src/ggml-aarch64.c",
+]
+
+var resources: [Resource] = []
+var linkerSettings: [LinkerSetting] = []
+var cSettings: [CSetting] = [
+    .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
+    .unsafeFlags(["-fno-objc-arc"]),
+    // NOTE: NEW_LAPACK will required iOS version 16.4+
+    // We should consider add this in the future when we drop support for iOS 14
+    // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+    // .define("ACCELERATE_NEW_LAPACK"),
+    // .define("ACCELERATE_LAPACK_ILP64")
+]
+
+#if canImport(Darwin)
+sources.append("ggml/src/ggml-metal.m")
+resources.append(.process("ggml/src/ggml-metal.metal"))
+linkerSettings.append(.linkedFramework("Accelerate"))
+cSettings.append(
+    contentsOf: [
+        .define("GGML_USE_ACCELERATE"),
+        .define("GGML_USE_METAL")
+    ]
+)
+#endif
+
+#if os(Linux)
+    cSettings.append(.define("_GNU_SOURCE"))
+#endif
+
 let package = Package(
     name: "llama",
     platforms: [
@@ -14,6 +56,24 @@ let package = Package(
         .library(name: "llama", targets: ["llama"]),
     ],
     targets: [
-        .systemLibrary(name: "llama", pkgConfig: "llama"),
-    ]
+        .target(
+            name: "llama",
+            path: ".",
+            exclude: [
+                "cmake",
+                "examples",
+                "scripts",
+                "models",
+                "tests",
+                "CMakeLists.txt",
+                "Makefile"
+            ],
+            sources: sources,
+            resources: resources,
+            publicHeadersPath: "spm-headers",
+            cSettings: cSettings,
+            linkerSettings: linkerSettings
+        )
+    ],
+    cxxLanguageStandard: .cxx11
 )
586 README.md

@@ -4,52 +4,62 @@
 [](https://opensource.org/licenses/MIT)
 [](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[](https://conan.io/center/llama-cpp)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

+> [!IMPORTANT]
+> [2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)

 ## Recent API changes

-- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
-- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
+- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
+- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
+- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
+- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
+- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
+- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
+- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
+- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849

 ## Hot topics

-- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggerganov/llama.cpp/pull/11427
-- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
-- Universal tool call support in `llama-server`: https://github.com/ggerganov/llama.cpp/pull/9639
-- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
-- Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123
-- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
-- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
+- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
+- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
+- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
+- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
+- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
+- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
+- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
+- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
+- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328

 ----

 ## Description

 The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
-range of hardware - locally and in the cloud.
+variety of hardware - locally and in the cloud.

 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
-- AVX, AVX2, AVX512 and AMX support for x86 architectures
+- AVX, AVX2 and AVX512 support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

-The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
+Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
+improved significantly thanks to many contributions. It is the main playground for developing new features for the
+[ggml](https://github.com/ggerganov/ggml) library.
-<details>
-<summary>Models</summary>
+**Supported models:**

 Typically finetunes of the base models below are supported as well.

-Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md)
-
-#### Text-only

 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
 - [x] LLaMA 3 🦙🦙🦙

@@ -73,7 +83,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md)
 - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
 - [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
 - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
-- [x] [PhiMoE](https://github.com/ggerganov/llama.cpp/pull/11003)
 - [x] [GPT-2](https://huggingface.co/gpt2)
 - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)

@@ -86,27 +95,12 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md)
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
-- [x] [OLMo 2](https://allenai.org/olmo)
-- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
-- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
-- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
+- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
-- [x] [Smaug](https://huggingface.co/models?search=Smaug)
-- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B)
-- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
-- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
-- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
-- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
-- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
-- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
-- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
-- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
-- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
-- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
-- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
-- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)

-#### Multimodal
+(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
+
+**Multimodal models:**

 - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
 - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)

@@ -117,169 +111,205 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md)
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
-- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
-- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)

-</details>
-
-<details>
-<summary>Bindings</summary>
+**Bindings:**

 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
-- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
 - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
-- Rust (automated build from crates.io): [ShelbyJenkins/llm_client](https://github.com/ShelbyJenkins/llm_client)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
-- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
-- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
 - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
-- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
-- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
</details>
|
**UI:**
|
||||||
|
|
||||||
<details>
|
Unless otherwise noted these projects are open-source with permissive licensing:
|
||||||
<summary>UIs</summary>
|
|
||||||
|
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
|
||||||
|
- [iohub/collama](https://github.com/iohub/coLLaMA)
|
||||||
|
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
|
||||||
|
- [nat/openplayground](https://github.com/nat/openplayground)
|
||||||
|
- [Faraday](https://faraday.dev/) (proprietary)
|
||||||
|
- [LMStudio](https://lmstudio.ai/) (proprietary)
|
||||||
|
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
|
||||||
|
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
|
||||||
|
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
|
||||||
|
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
|
||||||
|
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
|
||||||
|
- [ollama/ollama](https://github.com/ollama/ollama)
|
||||||
|
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
|
||||||
|
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
|
||||||
|
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
|
||||||
|
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
|
||||||
|
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
|
||||||
|
- [RAGNA Desktop](https://ragna.app/) (proprietary)
|
||||||
|
- [RecurseChat](https://recurse.chat/) (proprietary)
|
||||||
|
- [semperai/amica](https://github.com/semperai/amica)
|
||||||
|
- [withcatai/catai](https://github.com/withcatai/catai)
|
||||||
|
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
|
||||||
|
- [Msty](https://msty.app) (proprietary)
|
||||||
|
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
|
||||||
|
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
|
||||||
|
- [Dot](https://github.com/alexpinel/Dot) (GPL)
|
||||||
|
- [MindMac](https://mindmac.app) (proprietary)
|
||||||
|
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
|
||||||
|
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
|
||||||
|
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
|
||||||
|
- [AIKit](https://github.com/sozercan/aikit) (MIT)
|
||||||
|
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
|
||||||
|
|
||||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||||
|
|
||||||
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
|
**Tools:**
|
||||||
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
|
|
||||||
- [Dot](https://github.com/alexpinel/Dot) (GPL)
|
|
||||||
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
|
|
||||||
- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
|
|
||||||
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
|
|
||||||
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
|
|
||||||
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
|
|
||||||
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
|
|
||||||
- [LARS](https://github.com/abgulati/LARS) (AGPL)
|
|
||||||
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
|
|
||||||
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
|
|
||||||
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
|
|
||||||
- [LMStudio](https://lmstudio.ai/) (proprietary)
|
|
||||||
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
|
|
||||||
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
|
|
||||||
- [MindMac](https://mindmac.app) (proprietary)
|
|
||||||
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
|
|
||||||
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
|
|
||||||
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0)
|
|
||||||
- [nat/openplayground](https://github.com/nat/openplayground) (MIT)
|
|
||||||
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT)
|
|
||||||
- [ollama/ollama](https://github.com/ollama/ollama) (MIT)
|
|
||||||
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
|
|
||||||
- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
|
|
||||||
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT)
|
|
||||||
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT)
|
|
||||||
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
|
|
||||||
- [ramalama](https://github.com/containers/ramalama) (MIT)
|
|
||||||
- [semperai/amica](https://github.com/semperai/amica) (MIT)
|
|
||||||
- [withcatai/catai](https://github.com/withcatai/catai) (MIT)
|
|
||||||
- [Autopen](https://github.com/blackhole89/autopen) (GPL)
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details>
|
|
||||||
<summary>Tools</summary>
|
|
||||||
|
|
||||||
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
||||||
- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
|
|
||||||
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
|
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
|
||||||
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
|
|
||||||
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
|
|
||||||
|
|
||||||
</details>
|
**Infrastructure:**
|
||||||
|
|
||||||
<details>
|
|
||||||
<summary>Infrastructure</summary>
|
|
||||||
|
|
||||||
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
|
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
|
||||||
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
|
|
||||||
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
|
**Games:**
|
||||||
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
|
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
|
||||||
- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
|
|
||||||
|
## Demo
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
|
||||||
|
|
||||||
|
```
|
||||||
|
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
|
||||||
|
I llama.cpp build info:
|
||||||
|
I UNAME_S: Darwin
|
||||||
|
I UNAME_P: arm
|
||||||
|
I UNAME_M: arm64
|
||||||
|
I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
|
||||||
|
I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
|
||||||
|
I LDFLAGS: -framework Accelerate
|
||||||
|
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
|
||||||
|
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
|
||||||
|
|
||||||
|
make: Nothing to be done for `default'.
|
||||||
|
main: build = 1041 (cf658ad)
|
||||||
|
main: seed = 1692823051
|
||||||
|
llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
|
||||||
|
llama_model_loader: - type f32: 81 tensors
|
||||||
|
llama_model_loader: - type q4_0: 281 tensors
|
||||||
|
llama_model_loader: - type q6_K: 1 tensors
|
||||||
|
llm_load_print_meta: format = GGUF V1 (latest)
|
||||||
|
llm_load_print_meta: arch = llama
|
||||||
|
llm_load_print_meta: vocab type = SPM
|
||||||
|
llm_load_print_meta: n_vocab = 32000
|
||||||
|
llm_load_print_meta: n_merges = 0
|
||||||
|
llm_load_print_meta: n_ctx_train = 4096
|
||||||
|
llm_load_print_meta: n_ctx = 512
|
||||||
|
llm_load_print_meta: n_embd = 5120
|
||||||
|
llm_load_print_meta: n_head = 40
|
||||||
|
llm_load_print_meta: n_head_kv = 40
|
||||||
|
llm_load_print_meta: n_layer = 40
|
||||||
|
llm_load_print_meta: n_rot = 128
|
||||||
|
llm_load_print_meta: n_gqa = 1
|
||||||
|
llm_load_print_meta: f_norm_eps = 1.0e-05
|
||||||
|
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
|
||||||
|
llm_load_print_meta: n_ff = 13824
|
||||||
|
llm_load_print_meta: freq_base = 10000.0
|
||||||
|
llm_load_print_meta: freq_scale = 1
|
||||||
|
llm_load_print_meta: model type = 13B
|
||||||
|
llm_load_print_meta: model ftype = mostly Q4_0
|
||||||
|
llm_load_print_meta: model size = 13.02 B
|
||||||
|
llm_load_print_meta: general.name = LLaMA v2
|
||||||
|
llm_load_print_meta: BOS token = 1 '<s>'
|
||||||
|
llm_load_print_meta: EOS token = 2 '</s>'
|
||||||
|
llm_load_print_meta: UNK token = 0 '<unk>'
|
||||||
|
llm_load_print_meta: LF token = 13 '<0x0A>'
|
||||||
|
llm_load_tensors: ggml ctx size = 0.11 MB
|
||||||
|
llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
|
||||||
|
...................................................................................................
|
||||||
|
llama_new_context_with_model: kv self size = 400.00 MB
|
||||||
|
llama_new_context_with_model: compute buffer total size = 75.41 MB
|
||||||
|
|
||||||
|
system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
|
||||||
|
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
|
||||||
|
generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
|
||||||
|
|
||||||
|
|
||||||
|
Building a website can be done in 10 simple steps:
|
||||||
|
Step 1: Find the right website platform.
|
||||||
|
Step 2: Choose your domain name and hosting plan.
|
||||||
|
Step 3: Design your website layout.
|
||||||
|
Step 4: Write your website content and add images.
|
||||||
|
Step 5: Install security features to protect your site from hackers or spammers
|
||||||
|
Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
|
||||||
|
Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine!
|
||||||
|
Step 8: Start marketing and promoting the website via social media channels or paid ads
|
||||||
|
Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
|
||||||
|
Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
|
||||||
|
How does a Website Work?
|
||||||
|
A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
|
||||||
|
The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
|
||||||
|
How to
|
||||||
|
llama_print_timings: load time = 576.45 ms
|
||||||
|
llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
|
||||||
|
llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
|
||||||
|
llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
|
||||||
|
llama_print_timings: total time = 25431.49 ms
|
||||||
|
```
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>Games</summary>
|
<summary>Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook</summary>
|
||||||
|
|
||||||
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
|
And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
|
||||||
|
|
||||||
|
https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
## Supported backends
|
## Usage
|
||||||
|
|
||||||
| Backend | Target devices |
|
Here are the end-to-end binary build and model conversion steps for most supported models.
|
||||||
| --- | --- |
|
|
||||||
| [Metal](docs/build.md#metal-build) | Apple Silicon |
|
|
||||||
| [BLAS](docs/build.md#blas-build) | All |
|
|
||||||
| [BLIS](docs/backend/BLIS.md) | All |
|
|
||||||
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
|
|
||||||
| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
|
|
||||||
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
|
|
||||||
| [HIP](docs/build.md#hip) | AMD GPU |
|
|
||||||
| [Vulkan](docs/build.md#vulkan) | GPU |
|
|
||||||
| [CANN](docs/build.md#cann) | Ascend NPU |
|
|
||||||
|
|
||||||
## Building the project
|
### Basic usage
|
||||||
|
|
||||||
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
|
Firstly, you need to get the binary. There are different methods that you can follow:
|
||||||
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
|
- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
|
||||||
|
- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
|
||||||
|
- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
|
||||||
|
- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
|
||||||
|
|
||||||
- Clone this repository and build locally, see [how to build](docs/build.md)
|
You can run a basic completion using this command:
|
||||||
- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
|
|
||||||
- Use a Docker image, see [documentation for Docker](docs/docker.md)
|
|
||||||
- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases)
|
|
||||||
|
|
||||||
## Obtaining and quantizing models
|
|
||||||
|
|
||||||
The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
|
|
||||||
|
|
||||||
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
|
|
||||||
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
|
|
||||||
|
|
||||||
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf <user>/<model>[:quant]`
|
|
||||||
|
|
||||||
After downloading a model, use the CLI tools to run it locally - see below.
|
|
||||||
|
|
||||||
`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
|
|
||||||
|
|
||||||
The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
|
|
||||||
|
|
||||||
- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
|
|
||||||
- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123)
|
|
||||||
- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268)
|
|
||||||
- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669)
|
|
||||||
|
|
||||||
To learn more about model quantization, [read this documentation](examples/quantize/README.md)
|
|
||||||
|
|
||||||
## [`llama-cli`](examples/main)
|
|
||||||
|
|
||||||
#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
|
|
||||||
|
|
||||||
- <details open>
|
|
||||||
<summary>Run in conversation mode</summary>
|
|
||||||
|
|
||||||
Models with a built-in chat template will automatically activate conversation mode. If this doesn't occur, you can manually enable it by adding `-cnv` and specifying a suitable chat template with `--chat-template NAME`
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
llama-cli -m model.gguf
|
llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
|
||||||
|
|
||||||
|
# Output:
|
||||||
|
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
|
||||||
|
```
|
||||||
|
|
||||||
|
See [this page](./examples/main/README.md) for a full list of parameters.
|
||||||
|
|
||||||
|
### Conversation mode
|
||||||
|
|
||||||
|
If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
|
||||||
|
|
||||||
|
# Output:
|
||||||
# > hi, who are you?
|
# > hi, who are you?
|
||||||
# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
|
# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
|
||||||
#
|
#
|
||||||
|
@ -287,226 +317,151 @@ To learn more about model quantization, [read this documentation](examples/quant
|
||||||
# Easy peasy! The answer to 1+1 is... 2!
|
# Easy peasy! The answer to 1+1 is... 2!
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
||||||
|
|
||||||
- <details>
|
|
||||||
<summary>Run in conversation mode with custom chat template</summary>
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# use the "chatml" template (use -h to see the list of supported templates)
|
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
|
||||||
llama-cli -m model.gguf -cnv --chat-template chatml
|
|
||||||
|
|
||||||
# use a custom template
|
|
||||||
llama-cli -m model.gguf -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
|
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
|
||||||
|
|
||||||
- <details>
|
|
||||||
<summary>Run simple text completion</summary>
|
|
||||||
|
|
||||||
To disable conversation mode explicitly, use `-no-cnv`
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
|
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
|
||||||
|
|
||||||
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
|
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
### Web server
|
||||||
|
|
||||||
- <details>
|
[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
|
||||||
<summary>Constrain the output with a custom grammar</summary>
|
|
||||||
|
Example usage:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
llama-cli -m model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
|
./llama-server -m your_model.gguf --port 8080
|
||||||
|
|
||||||
# {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"}
|
|
||||||
```
|
|
||||||
|
|
||||||
The [grammars/](grammars/) folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](grammars/README.md).
|
|
||||||
|
|
||||||
For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
|
|
||||||
## [`llama-server`](examples/server)
|
|
||||||
|
|
||||||
#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.
|
|
||||||
|
|
||||||
- <details open>
|
|
||||||
<summary>Start a local HTTP server with default configuration on port 8080</summary>
|
|
||||||
|
|
||||||
```bash
|
|
||||||
llama-server -m model.gguf --port 8080
|
|
||||||
|
|
||||||
# Basic web UI can be accessed via browser: http://localhost:8080
|
# Basic web UI can be accessed via browser: http://localhost:8080
|
||||||
# Chat completion endpoint: http://localhost:8080/v1/chat/completions
|
# Chat completion endpoint: http://localhost:8080/v1/chat/completions
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
### Interactive mode
|
||||||
|
|
||||||
- <details>
|
> [!NOTE]
|
||||||
<summary>Support multiple-users and parallel decoding</summary>
|
> If you prefer basic usage, please consider using conversation mode instead of interactive mode
|
||||||
|
|
||||||
|
In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
|
||||||
|
|
||||||
|
Here is an example of a few-shot interaction, invoked with the command
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# up to 4 concurrent requests, each with 4096 max context
|
# default arguments using a 7B model
|
||||||
llama-server -m model.gguf -c 16384 -np 4
|
./examples/chat.sh
|
||||||
|
|
||||||
|
# advanced chat with a 13B model
|
||||||
|
./examples/chat-13B.sh
|
||||||
|
|
||||||
|
# custom arguments using a 13B model
|
||||||
|
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
|
||||||
|
|
||||||
- <details>
|

|
||||||
<summary>Enable speculative decoding</summary>
|
|
||||||
|
### Persistent Interaction
|
||||||
|
|
||||||
|
The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# the draft.gguf model should be a small variant of the target model.gguf
|
# Start a new chat
|
||||||
llama-server -m model.gguf -md draft.gguf
|
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
|
||||||
|
|
||||||
|
# Resume that chat
|
||||||
|
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
|
||||||
|
|
||||||
|
# Start a different chat with the same prompt/model
|
||||||
|
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
|
||||||
|
|
||||||
|
# Different prompt cache for different prompt/model
|
||||||
|
PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
|
||||||
|
CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
### Constrained output with grammars
|
||||||
|
|
||||||
- <details>
|
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
|
||||||
<summary>Serve an embedding model</summary>
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# use the /embedding endpoint
|
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
|
||||||
llama-server -m model.gguf --embedding --pooling cls -ub 8192
|
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
|
||||||
|
|
||||||
-- <details>
-<summary>Serve a reranking model</summary>
-
-```bash
-# use the /reranking endpoint
-llama-server -m model.gguf --reranking
-```
-
-</details>
-
-- <details>
-<summary>Constrain all outputs with a grammar</summary>
-
-```bash
-# custom grammar
-llama-server -m model.gguf --grammar-file grammar.gbnf
-
-# JSON
-llama-server -m model.gguf --grammar-file grammars/json.gbnf
-```
-
-</details>
-
-## [`llama-perplexity`](examples/perplexity)
-
-#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
-
-- <details open>
-<summary>Measure the perplexity over a text file</summary>
-
-```bash
-llama-perplexity -m model.gguf -f file.txt
-
-# [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ...
-# Final estimate: PPL = 5.4007 +/- 0.67339
-```
-
-</details>
-
-- <details>
-<summary>Measure KL divergence</summary>
-
-```bash
-# TODO
-```
-
-</details>
-
-[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
-[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
-
-## [`llama-bench`](examples/llama-bench)
-
-#### Benchmark the performance of the inference for various parameters.
-
-- <details open>
-<summary>Run default benchmark</summary>
-
-```bash
-llama-bench -m model.gguf
-
-# Output:
-# | model | size | params | backend | threads | test | t/s |
-# | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: |
-# | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | pp512 | 5765.41 ± 20.55 |
-# | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | tg128 | 197.71 ± 0.81 |
-#
-# build: 3e0ba0e60 (4229)
-```
-
-</details>
-
-## [`llama-run`](examples/run)
-
-#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
-
-- <details>
-<summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
-
-```bash
-llama-run granite-code
-```
-
-</details>
-
-[^3]: [RamaLama](https://github.com/containers/ramalama)
-
-## [`llama-simple`](examples/simple)
-
-#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
-
-- <details>
-<summary>Basic text completion</summary>
-
-```bash
-llama-simple -m model.gguf
-
-# Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called "The Art of
-```
-
-</details>
-
+For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
+
+## Build
+
+Please refer to [Build llama.cpp locally](./docs/build.md)
+
+## Supported backends
+
+| Backend | Target devices |
+| --- | --- |
+| [Metal](./docs/build.md#metal-build) | Apple Silicon |
+| [BLAS](./docs/build.md#blas-build) | All |
+| [BLIS](./docs/backend/BLIS.md) | All |
+| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
+| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
+| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
+| [Vulkan](./docs/build.md#vulkan) | GPU |
+
+## Tools
+
+### Prepare and Quantize
+
+> [!NOTE]
+> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
+
+To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
+
+Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
+It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
+
+To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
+
+### Perplexity (measuring model quality)
+
+You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
+For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
+
+To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
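As a concrete companion to the convert-and-quantize notes above, here is a minimal sketch of the flow; the model directory and output file names are placeholders, while the script name and the `llama-quantize` invocation mirror the commands that appear later in this compare inside `ci/run.sh`.

```bash
# Sketch only: convert a locally downloaded Hugging Face model to GGUF, then quantize it.
python3 convert_hf_to_gguf.py ./models/my-hf-model --outfile ./models/my-model-f16.gguf
./llama-quantize ./models/my-model-f16.gguf ./models/my-model-q4_k.gguf q4_k
```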
 ## Contributing
 
 - Contributors can open PRs
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
-- Any help with managing issues, PRs and projects is very appreciated!
+- Any help with managing issues and PRs is very appreciated!
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
 - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
 
-## Other documentation
+## Other documentations
 
-- [main (cli)](examples/main/README.md)
+- [main (cli)](./examples/main/README.md)
-- [server](examples/server/README.md)
+- [server](./examples/server/README.md)
-- [GBNF grammars](grammars/README.md)
+- [jeopardy](./examples/jeopardy/README.md)
+- [GBNF grammars](./grammars/README.md)
 
-#### Development documentation
+**Development documentations**
 
-- [How to build](docs/build.md)
+- [How to build](./docs/build.md)
-- [Running on Docker](docs/docker.md)
+- [Running on Docker](./docs/docker.md)
-- [Build on Android](docs/android.md)
+- [Build on Android](./docs/android.md)
-- [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
+- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
 
-#### Seminal papers and background on the models
+**Seminal papers and background on the models**
 
 If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
 - LLaMA:

@@ -517,6 +472,3 @@ If your issue is with model generation quality, then please at least scan the fo
 - GPT-3.5 / InstructGPT / ChatGPT:
 - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
 - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
 
-#### References
@@ -1,4 +0,0 @@
-#pragma once
-
-#include <llama.h>
-

@@ -1,5 +0,0 @@
-module llama [system] {
-    header "llama.h"
-    link "llama"
-    export *
-}
291 ci/run.sh

@@ -1,4 +1,4 @@
-#!/bin/bash
+#/bin/bash
 #
 # sample usage:
 #

@@ -13,9 +13,6 @@
 # # with SYCL support
 # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
-# # with VULKAN support
-# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#
 
 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"

@@ -39,11 +36,11 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
 
 if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
 fi
 
 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
 fi
 
 if [ ! -z ${GG_BUILD_SYCL} ]; then

@@ -53,11 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
        exit 1
    fi
 
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
-fi
-
-if [ ! -z ${GG_BUILD_VULKAN} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
 fi
 
 ## helpers

@@ -114,7 +107,7 @@ function gg_run_ctest_debug {
     gg_check_build_requirements
 
     (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
 

@@ -145,7 +138,7 @@ function gg_run_ctest_release {
     gg_check_build_requirements
 
     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     if [ -z ${GG_BUILD_LOW_PERF} ]; then
         (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log

@@ -273,6 +266,7 @@ function gg_sum_ctest_with_model_release {
 }
 
 # open_llama_7b_v2
+# requires: GG_BUILD_CUDA
 
 function gg_run_open_llama_7b_v2 {
     cd ${SRC}
|
@ -296,8 +290,8 @@ function gg_run_open_llama_7b_v2 {
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
@ -326,36 +320,36 @@ function gg_run_open_llama_7b_v2 {
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@ -431,7 +425,7 @@ function gg_run_pythia_1_4b {
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
@ -460,34 +454,34 @@ function gg_run_pythia_1_4b {
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@ -541,6 +535,7 @@ function gg_sum_pythia_1_4b {
|
||||||
}
|
}
|
||||||
|
|
||||||
# pythia_2_8b
|
# pythia_2_8b
|
||||||
|
# requires: GG_BUILD_CUDA
|
||||||
|
|
||||||
function gg_run_pythia_2_8b {
|
function gg_run_pythia_2_8b {
|
||||||
cd ${SRC}
|
cd ${SRC}
|
||||||
|
@ -561,8 +556,8 @@ function gg_run_pythia_2_8b {
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
@ -591,36 +586,36 @@ function gg_run_pythia_2_8b {
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@ -697,7 +692,7 @@ function gg_run_embd_bge_small {
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
@ -706,88 +701,12 @@ function gg_run_embd_bge_small {
|
||||||
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
|
||||||
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
}
|
}
|
||||||
|
|
||||||
function gg_sum_embd_bge_small {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'BGE Small (BERT):\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
|
||||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
|
||||||
}
|
|
||||||
|
|
||||||
# rerank_tiny
|
|
||||||
|
|
||||||
function gg_run_rerank_tiny {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
|
|
||||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
|
|
||||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
|
|
||||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
|
|
||||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
|
|
||||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
|
|
||||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
|
|
||||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
|
|
||||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
|
|
||||||
|
|
||||||
gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json
|
|
||||||
|
|
||||||
path_models="../models-mnt/rerank-tiny"
|
|
||||||
|
|
||||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
|
||||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
|
||||||
|
|
||||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
|
||||||
|
|
||||||
# for this model, the SEP token is "</s>"
|
|
||||||
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
|
||||||
|
|
||||||
# sample output
|
|
||||||
# rerank score 0: 0.029
|
|
||||||
# rerank score 1: 0.029
|
|
||||||
# rerank score 2: 0.135
|
|
||||||
|
|
||||||
# check that the score is in the range [$3, $4]
|
|
||||||
function check_score {
|
|
||||||
qnt="$1"
|
|
||||||
score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
|
||||||
|
|
||||||
if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
|
|
||||||
printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
|
|
||||||
return 20
|
|
||||||
fi
|
|
||||||
|
|
||||||
printf ' - %s @ %s OK\n' "$qnt" "$score"
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
|
|
||||||
check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
|
|
||||||
check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_rerank_tiny {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'Rerank Tiny (Jina):\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_check_build_requirements {
|
function gg_check_build_requirements {
|
||||||
if ! command -v cmake &> /dev/null; then
|
if ! command -v cmake &> /dev/null; then
|
||||||
gg_printf 'cmake not found, please install'
|
gg_printf 'cmake not found, please install'
|
||||||
|
@@ -802,10 +721,16 @@ function gg_check_build_requirements {
     fi
 }
 
-## main
-
-export LLAMA_LOG_PREFIX=1
-export LLAMA_LOG_TIMESTAMPS=1
+function gg_sum_embd_bge_small {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'BGE Small (BERT):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+}
+
+## main
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt

@@ -815,10 +740,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
     ln -sfn ${mnt_models} ${SRC}/models-mnt
 
     # Create a fresh python3 venv and enter it
-    if ! python3 -m venv "$MNT/venv"; then
-        echo "Error: Failed to create Python virtual environment at $MNT/venv."
-        exit 1
-    fi
+    python3 -m venv "$MNT/venv"
     source "$MNT/venv/bin/activate"
 
     pip install -r ${SRC}/requirements.txt --disable-pip-version-check

@@ -832,7 +754,6 @@ test $ret -eq 0 && gg_run ctest_release
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     test $ret -eq 0 && gg_run embd_bge_small
-    test $ret -eq 0 && gg_run rerank_tiny
 
     if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
         test $ret -eq 0 && gg_run test_scripts_debug

@@ -840,7 +761,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
     fi
 
     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-        if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
+        if [ -z ${GG_BUILD_CUDA} ]; then
            test $ret -eq 0 && gg_run pythia_1_4b
        else
            test $ret -eq 0 && gg_run pythia_2_8b
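For reference, the `GG_BUILD_*` toggles exercised in these hunks are plain environment variables; they are passed the same way as the "sample usage" lines near the top of `ci/run.sh`. A sketch, reusing the same placeholder output and mount directories from those comments:

```bash
# Run the CI script locally with the reduced workload, or with CUDA enabled and enough VRAM
# declared to select the larger pythia_2_8b run instead of pythia_1_4b.
GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
GG_BUILD_CUDA=1 GG_BUILD_VRAM_GB=16 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```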
@@ -1,16 +0,0 @@
-set( CMAKE_SYSTEM_NAME Darwin )
-set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
-set( target arm64-apple-darwin-macho )
-
-set( CMAKE_C_COMPILER clang )
-set( CMAKE_CXX_COMPILER clang++ )
-
-set( CMAKE_C_COMPILER_TARGET ${target} )
-set( CMAKE_CXX_COMPILER_TARGET ${target} )
-
-set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
-set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
-
-set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
-set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
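The file removed above is a CMake cross-compilation toolchain; such a file is normally selected at configure time. A minimal sketch of how it would be used, with the path being an assumption since this hunk does not show the file name:

```bash
# Hypothetical path; point CMAKE_TOOLCHAIN_FILE at wherever the toolchain file lives in the tree.
cmake -B build -DCMAKE_TOOLCHAIN_FILE=cmake/arm64-apple-darwin.cmake
cmake --build build --config Release
```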
@@ -44,7 +44,7 @@ if(MSVC)
     set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
 else()
     execute_process(
-        COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER}
+        COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
         OUTPUT_VARIABLE OUT
         OUTPUT_STRIP_TRAILING_WHITESPACE
     )
@@ -1,33 +0,0 @@
-function(llama_add_compile_flags)
-    if (LLAMA_FATAL_WARNINGS)
-        if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-            list(APPEND C_FLAGS -Werror)
-            list(APPEND CXX_FLAGS -Werror)
-        elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-            add_compile_options(/WX)
-        endif()
-    endif()
-
-    if (LLAMA_ALL_WARNINGS)
-        if (NOT MSVC)
-            list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-                -Werror=implicit-int -Werror=implicit-function-declaration)
-
-            list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
-
-            list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-
-            list(APPEND C_FLAGS ${WARNING_FLAGS})
-            list(APPEND CXX_FLAGS ${WARNING_FLAGS})
-
-            ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
-
-            add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
-                "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
-        else()
-            # todo : msvc
-            set(C_FLAGS "" PARENT_SCOPE)
-            set(CXX_FLAGS "" PARENT_SCOPE)
-        endif()
-    endif()
-endfunction()
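The warning options handled by the removed helper above are ordinary CMake cache options; enabling them happens at configure time. A sketch, with the build directory name being arbitrary:

```bash
# Turn warnings into errors and enable the extended warning set when configuring.
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_ALL_WARNINGS=ON
cmake --build build
```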
@ -3,28 +3,88 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
|
||||||
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
|
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
|
||||||
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
||||||
|
|
||||||
|
set(GGML_BLAS @GGML_BLAS@)
|
||||||
|
set(GGML_CUDA @GGML_CUDA@)
|
||||||
|
set(GGML_METAL @GGML_METAL@)
|
||||||
|
set(GGML_HIPBLAS @GGML_HIPBLAS@)
|
||||||
|
set(GGML_ACCELERATE @GGML_ACCELERATE@)
|
||||||
|
set(GGML_VULKAN @GGML_VULKAN@)
|
||||||
|
set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
|
||||||
|
set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
|
||||||
|
set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
|
||||||
|
set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
|
||||||
|
set(GGML_SYCL @GGML_SYCL@)
|
||||||
|
set(GGML_OPENMP @GGML_OPENMP@)
|
||||||
|
|
||||||
@PACKAGE_INIT@
|
@PACKAGE_INIT@
|
||||||
|
|
||||||
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
|
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
|
||||||
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
|
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
|
||||||
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
|
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
|
||||||
|
|
||||||
find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)
|
# Ensure transient dependencies satisfied
|
||||||
|
|
||||||
|
find_package(Threads REQUIRED)
|
||||||
|
|
||||||
|
if (APPLE AND GGML_ACCELERATE)
|
||||||
|
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_BLAS)
|
||||||
|
find_package(BLAS REQUIRED)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_CUDA)
|
||||||
|
find_package(CUDAToolkit REQUIRED)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_METAL)
|
||||||
|
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||||
|
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||||
|
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_VULKAN)
|
||||||
|
find_package(Vulkan REQUIRED)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_HIPBLAS)
|
||||||
|
find_package(hip REQUIRED)
|
||||||
|
find_package(hipblas REQUIRED)
|
||||||
|
find_package(rocblas REQUIRED)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_SYCL)
|
||||||
|
find_package(IntelSYCL REQUIRED)
|
||||||
|
find_package(MKL REQUIRED)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_OPENMP)
|
||||||
|
find_package(OpenMP REQUIRED)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
|
find_library(ggml_LIBRARY ggml
|
||||||
|
REQUIRED
|
||||||
|
HINTS ${LLAMA_LIB_DIR})
|
||||||
|
|
||||||
find_library(llama_LIBRARY llama
|
find_library(llama_LIBRARY llama
|
||||||
REQUIRED
|
REQUIRED
|
||||||
HINTS ${LLAMA_LIB_DIR}
|
HINTS ${LLAMA_LIB_DIR})
|
||||||
NO_CMAKE_FIND_ROOT_PATH
|
|
||||||
)
|
set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
|
||||||
|
set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
|
||||||
|
|
||||||
add_library(llama UNKNOWN IMPORTED)
|
add_library(llama UNKNOWN IMPORTED)
|
||||||
|
|
||||||
set_target_properties(llama
|
set_target_properties(llama
|
||||||
PROPERTIES
|
PROPERTIES
|
||||||
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
|
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
|
||||||
INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
|
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
|
||||||
|
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
|
||||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||||
IMPORTED_LOCATION "${llama_LIBRARY}"
|
IMPORTED_LOCATION "${llama_LIBRARY}"
|
||||||
INTERFACE_COMPILE_FEATURES c_std_90
|
INTERFACE_COMPILE_FEATURES cxx_std_11
|
||||||
POSITION_INDEPENDENT_CODE ON )
|
POSITION_INDEPENDENT_CODE ON )
|
||||||
|
|
||||||
check_required_components(Llama)
|
check_required_components(Llama)
|
||||||
|
|
|
@@ -1,10 +1,10 @@
 prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=@CMAKE_INSTALL_PREFIX@
-libdir=@CMAKE_INSTALL_FULL_LIBDIR@
-includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
 
 Name: llama
 Description: Port of Facebook's LLaMA model in C/C++
-Version: @LLAMA_INSTALL_VERSION@
-Libs: -L${libdir} -lggml -lggml-base -lllama
+Version: @PROJECT_VERSION@
+Libs: -L${libdir} -lllama
 Cflags: -I${includedir}
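Either version of this `llama.pc` template is consumed through `pkg-config` once the library is installed. A minimal sketch of building against it, where the source file name is a placeholder:

```bash
# Show the flags exported by the installed llama.pc, then compile a small program with them.
pkg-config --cflags --libs llama
c++ main.cpp $(pkg-config --cflags --libs llama) -o main
```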
@@ -1,11 +0,0 @@
-set( CMAKE_SYSTEM_NAME Windows )
-set( CMAKE_SYSTEM_PROCESSOR x86_64 )
-
-set( CMAKE_C_COMPILER clang )
-set( CMAKE_CXX_COMPILER clang++ )
-
-set( arch_c_flags "-march=native" )
-
-set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
-set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
-
@ -2,8 +2,6 @@
|
||||||
|
|
||||||
find_package(Threads REQUIRED)
|
find_package(Threads REQUIRED)
|
||||||
|
|
||||||
llama_add_compile_flags()
|
|
||||||
|
|
||||||
# Build info header
|
# Build info header
|
||||||
#
|
#
|
||||||
|
|
||||||
|
@ -53,28 +51,21 @@ endif()
|
||||||
set(TARGET common)
|
set(TARGET common)
|
||||||
|
|
||||||
add_library(${TARGET} STATIC
|
add_library(${TARGET} STATIC
|
||||||
arg.cpp
|
|
||||||
arg.h
|
|
||||||
base64.hpp
|
base64.hpp
|
||||||
chat.cpp
|
|
||||||
chat.hpp
|
|
||||||
chat-template.hpp
|
|
||||||
common.cpp
|
|
||||||
common.h
|
common.h
|
||||||
console.cpp
|
common.cpp
|
||||||
console.h
|
|
||||||
json-schema-to-grammar.cpp
|
|
||||||
json.hpp
|
|
||||||
llguidance.cpp
|
|
||||||
log.cpp
|
|
||||||
log.h
|
|
||||||
minja.hpp
|
|
||||||
ngram-cache.cpp
|
|
||||||
ngram-cache.h
|
|
||||||
sampling.cpp
|
|
||||||
sampling.h
|
sampling.h
|
||||||
speculative.cpp
|
sampling.cpp
|
||||||
speculative.h
|
console.h
|
||||||
|
console.cpp
|
||||||
|
grammar-parser.h
|
||||||
|
grammar-parser.cpp
|
||||||
|
json.hpp
|
||||||
|
json-schema-to-grammar.cpp
|
||||||
|
train.h
|
||||||
|
train.cpp
|
||||||
|
ngram-cache.h
|
||||||
|
ngram-cache.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
if (BUILD_SHARED_LIBS)
|
if (BUILD_SHARED_LIBS)
|
||||||
|
@@ -86,39 +77,12 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
 # Use curl to download model url
 if (LLAMA_CURL)
-    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
+    add_definitions(-DLLAMA_USE_CURL)
     find_package(CURL REQUIRED)
     include_directories(${CURL_INCLUDE_DIRS})
     find_library(CURL_LIBRARY curl REQUIRED)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
 endif ()

-if (LLAMA_LLGUIDANCE)
-    include(ExternalProject)
-    set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
-    set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
-    ExternalProject_Add(llguidance_ext
-        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.6.12:
-        GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
-        PREFIX ${CMAKE_BINARY_DIR}/llguidance
-        SOURCE_DIR ${LLGUIDANCE_SRC}
-        BUILD_IN_SOURCE TRUE
-        CONFIGURE_COMMAND ""
-        BUILD_COMMAND cargo build --release
-        INSTALL_COMMAND ""
-        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/libllguidance.a ${LLGUIDANCE_PATH}/llguidance.h
-        UPDATE_COMMAND ""
-    )
-    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)
-
-    add_library(llguidance STATIC IMPORTED)
-    set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/libllguidance.a)
-    add_dependencies(llguidance llguidance_ext)
-
-    target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance)
-endif ()
-
 target_include_directories(${TARGET} PUBLIC .)
-target_compile_features   (${TARGET} PUBLIC cxx_std_17)
+target_compile_features   (${TARGET} PUBLIC cxx_std_11)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
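A side note on the hunk above: on the left the LLAMA_USE_CURL define is attached to the common target with PUBLIC visibility, on the right it is set globally through add_definitions(). Either way the C++ sources only see a preprocessor macro. A minimal sketch of how such a guard is typically consumed is below; the helper name common_has_curl is an illustrative placeholder, not something taken from the diff.

// Sketch only: LLAMA_USE_CURL comes from the CMake option above; the helper is hypothetical.
#ifdef LLAMA_USE_CURL
static bool common_has_curl() { return true;  } // built with model-download support
#else
static bool common_has_curl() { return false; } // model URLs cannot be fetched
#endif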
2370	common/arg.cpp
File diff suppressed because it is too large

80	common/arg.h
@@ -1,80 +0,0 @@
-#pragma once
-
-#include "common.h"
-
-#include <set>
-#include <string>
-#include <vector>
-
-//
-// CLI argument parsing
-//
-
-struct common_arg {
-    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
-    std::set<enum llama_example> excludes = {};
-    std::vector<const char *> args;
-    const char * value_hint   = nullptr; // help text or example for arg value
-    const char * value_hint_2 = nullptr; // for second arg value
-    const char * env          = nullptr;
-    std::string help;
-    bool is_sparam = false; // is current arg a sampling param?
-    void (*handler_void)    (common_params & params) = nullptr;
-    void (*handler_string)  (common_params & params, const std::string &) = nullptr;
-    void (*handler_str_str) (common_params & params, const std::string &, const std::string &) = nullptr;
-    void (*handler_int)     (common_params & params, int) = nullptr;
-
-    common_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(common_params & params, const std::string &)
-    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
-
-    common_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(common_params & params, int)
-    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
-
-    common_arg(
-        const std::initializer_list<const char *> & args,
-        const std::string & help,
-        void (*handler)(common_params & params)
-    ) : args(args), help(help), handler_void(handler) {}
-
-    // support 2 values for arg
-    common_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const char * value_hint_2,
-        const std::string & help,
-        void (*handler)(common_params & params, const std::string &, const std::string &)
-    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
-
-    common_arg & set_examples(std::initializer_list<enum llama_example> examples);
-    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
-    common_arg & set_env(const char * env);
-    common_arg & set_sparam();
-    bool in_example(enum llama_example ex);
-    bool is_exclude(enum llama_example ex);
-    bool get_value_from_env(std::string & output);
-    bool has_value_from_env();
-    std::string to_string();
-};
-
-struct common_params_context {
-    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
-    common_params & params;
-    std::vector<common_arg> options;
-    void(*print_usage)(int, char **) = nullptr;
-
-    common_params_context(common_params & params) : params(params) {}
-};
-
-// parse input arguments from CLI
-// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-
-// function to be used by test-arg-parser
-common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
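Since the removed header above only declares the parsing surface, here is a minimal sketch of how a program drives it, assuming the usual common_params struct and the LLAMA_EXAMPLE_COMMON enum value from common.h; the print_usage stub is illustrative and not part of the diff.

// Hedged sketch of a CLI front-end built on common/arg.h.
#include "arg.h"
#include "common.h"

#include <cstdio>

static void print_usage(int, char ** argv) {
    printf("usage: %s [options]\n", argv[0]);
}

int main(int argc, char ** argv) {
    common_params params;
    // Fills `params` from argv; on an invalid value it prints the usage of the
    // offending argument only (see the comment in the header) and returns false.
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }
    // ... use params ...
    return 0;
}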
@ -1,529 +0,0 @@
|
||||||
/*
|
|
||||||
Copyright 2024 Google LLC
|
|
||||||
|
|
||||||
Use of this source code is governed by an MIT-style
|
|
||||||
license that can be found in the LICENSE file or at
|
|
||||||
https://opensource.org/licenses/MIT.
|
|
||||||
*/
|
|
||||||
// SPDX-License-Identifier: MIT
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "minja.hpp"
|
|
||||||
#include <json.hpp>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
using json = nlohmann::ordered_json;
|
|
||||||
|
|
||||||
namespace minja {
|
|
||||||
|
|
||||||
struct chat_template_caps {
|
|
||||||
bool supports_tools = false;
|
|
||||||
bool supports_tool_calls = false;
|
|
||||||
bool supports_tool_responses = false;
|
|
||||||
bool supports_system_role = false;
|
|
||||||
bool supports_parallel_tool_calls = false;
|
|
||||||
bool supports_tool_call_id = false;
|
|
||||||
// meta-llama/Llama-3.1-8B-Instruct expects arguments to be an object.
|
|
||||||
// Most other templates (and OpenAI's API) expect the arguments object to be stringified.
|
|
||||||
bool requires_object_arguments = false;
|
|
||||||
// CohereForAI/c4ai-command-r-plus simple variant
|
|
||||||
bool requires_non_null_content = false;
|
|
||||||
// MiniMaxAI/MiniMax-Text-01 special
|
|
||||||
bool requires_typed_content = false;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct chat_template_inputs {
|
|
||||||
nlohmann::ordered_json messages;
|
|
||||||
nlohmann::ordered_json tools;
|
|
||||||
bool add_generation_prompt = true;
|
|
||||||
nlohmann::ordered_json extra_context;
|
|
||||||
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
|
||||||
};
|
|
||||||
|
|
||||||
struct chat_template_options {
|
|
||||||
bool apply_polyfills = true;
|
|
||||||
bool use_bos_token = true;
|
|
||||||
bool use_eos_token = true;
|
|
||||||
bool define_strftime_now = true;
|
|
||||||
|
|
||||||
bool polyfill_tools = true;
|
|
||||||
bool polyfill_tool_call_examples = true;
|
|
||||||
bool polyfill_tool_calls = true;
|
|
||||||
bool polyfill_tool_responses = true;
|
|
||||||
bool polyfill_system_role = true;
|
|
||||||
bool polyfill_object_arguments = true;
|
|
||||||
bool polyfill_typed_content = true;
|
|
||||||
};
|
|
||||||
|
|
||||||
class chat_template {
|
|
||||||
|
|
||||||
private:
|
|
||||||
chat_template_caps caps_;
|
|
||||||
std::string source_;
|
|
||||||
std::string bos_token_;
|
|
||||||
std::string eos_token_;
|
|
||||||
std::shared_ptr<minja::TemplateNode> template_root_;
|
|
||||||
std::string tool_call_example_;
|
|
||||||
|
|
||||||
std::string try_raw_render(
|
|
||||||
const nlohmann::ordered_json & messages,
|
|
||||||
const nlohmann::ordered_json & tools,
|
|
||||||
bool add_generation_prompt,
|
|
||||||
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const
|
|
||||||
{
|
|
||||||
try {
|
|
||||||
chat_template_inputs inputs;
|
|
||||||
inputs.messages = messages;
|
|
||||||
inputs.tools = tools;
|
|
||||||
inputs.add_generation_prompt = add_generation_prompt;
|
|
||||||
inputs.extra_context = extra_context;
|
|
||||||
// Use fixed date for tests
|
|
||||||
inputs.now = std::chrono::system_clock::from_time_t(0);
|
|
||||||
|
|
||||||
chat_template_options opts;
|
|
||||||
opts.apply_polyfills = false;
|
|
||||||
|
|
||||||
auto prompt = apply(inputs, opts);
|
|
||||||
// fprintf(stderr, "try_raw_render: %s\n", prompt.c_str());
|
|
||||||
return prompt;
|
|
||||||
} catch (const std::exception & e) {
|
|
||||||
// fprintf(stderr, "try_raw_render error: %s\n", e.what());
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public:
|
|
||||||
|
|
||||||
chat_template(const std::string & source, const std::string & bos_token, const std::string & eos_token)
|
|
||||||
: source_(source), bos_token_(bos_token), eos_token_(eos_token)
|
|
||||||
{
|
|
||||||
template_root_ = minja::Parser::parse(source_, {
|
|
||||||
/* .trim_blocks = */ true,
|
|
||||||
/* .lstrip_blocks = */ true,
|
|
||||||
/* .keep_trailing_newline = */ false,
|
|
||||||
});
|
|
||||||
|
|
||||||
auto contains = [](const std::string & haystack, const std::string & needle) {
|
|
||||||
return haystack.find(needle) != std::string::npos;
|
|
||||||
};
|
|
||||||
|
|
||||||
const std::string user_needle = "<User Needle>";
|
|
||||||
const std::string sys_needle = "<System Needle>";
|
|
||||||
const json dummy_str_user_msg = {{"role", "user"}, {"content", user_needle}};
|
|
||||||
const json dummy_typed_user_msg = {{"role", "user"}, {"content", json::array({{{"type", "text"}, {"text", user_needle}}})}};
|
|
||||||
|
|
||||||
caps_.requires_typed_content =
|
|
||||||
!contains(try_raw_render(json::array({dummy_str_user_msg}), {}, false), user_needle)
|
|
||||||
&& contains(try_raw_render(json::array({dummy_typed_user_msg}), {}, false), user_needle);
|
|
||||||
|
|
||||||
const auto dummy_user_msg = caps_.requires_typed_content
|
|
||||||
? dummy_typed_user_msg
|
|
||||||
: dummy_str_user_msg;
|
|
||||||
const json needle_system_msg = {
|
|
||||||
{"role", "system"},
|
|
||||||
{"content", caps_.requires_typed_content ? json::array({{{"type", "text"}, {"text", sys_needle}}}) : json(sys_needle)},
|
|
||||||
};
|
|
||||||
|
|
||||||
caps_.supports_system_role = contains(try_raw_render({needle_system_msg, dummy_user_msg,}, {}, false), sys_needle);
|
|
||||||
|
|
||||||
auto out = try_raw_render(json::array({
|
|
||||||
dummy_user_msg
|
|
||||||
}), json::array({
|
|
||||||
{
|
|
||||||
{"name", "some_tool"},
|
|
||||||
{"type", "function"},
|
|
||||||
{"function", {
|
|
||||||
{"name", "some_tool"},
|
|
||||||
{"description", "Some tool."},
|
|
||||||
{"parameters", {
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
{"arg", {
|
|
||||||
{"type", "string"},
|
|
||||||
{"description", "Some argument."},
|
|
||||||
}},
|
|
||||||
}},
|
|
||||||
{"required", json::array({ "arg" })},
|
|
||||||
}},
|
|
||||||
}},
|
|
||||||
},
|
|
||||||
}), false);
|
|
||||||
caps_.supports_tools = contains(out, "some_tool");
|
|
||||||
|
|
||||||
auto make_tool_calls_msg = [&](const json & tool_calls) {
|
|
||||||
return json {
|
|
||||||
{"role", "assistant"},
|
|
||||||
{"content", nullptr},
|
|
||||||
{"tool_calls", tool_calls},
|
|
||||||
};
|
|
||||||
};
|
|
||||||
auto make_tool_call = [](const std::string & tool_name, const json & arguments) {
|
|
||||||
return json {
|
|
||||||
{"id", "call_1___"},
|
|
||||||
{"type", "function"},
|
|
||||||
{"function", {
|
|
||||||
{"arguments", arguments},
|
|
||||||
{"name", tool_name},
|
|
||||||
}},
|
|
||||||
};
|
|
||||||
};
|
|
||||||
const json dummy_args_obj {{"argument_needle", "print('Hello, World!')"}};
|
|
||||||
|
|
||||||
// Note: the arguments are rendered in both cases, but may be double-escaped, which we don't want.
|
|
||||||
out = try_raw_render(json::array({
|
|
||||||
dummy_user_msg,
|
|
||||||
make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj.dump())})),
|
|
||||||
}), {}, false);
|
|
||||||
auto tool_call_renders_str_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
|
|
||||||
out = try_raw_render(json::array({
|
|
||||||
dummy_user_msg,
|
|
||||||
make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj)})),
|
|
||||||
}), {}, false);
|
|
||||||
auto tool_call_renders_obj_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
|
|
||||||
|
|
||||||
caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
|
|
||||||
caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
|
|
||||||
auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
|
|
||||||
auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
|
|
||||||
caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
|
|
||||||
|
|
||||||
if (caps_.supports_tool_calls) {
|
|
||||||
auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump());
|
|
||||||
auto tc1 = make_tool_call("test_tool1", dummy_args);
|
|
||||||
auto tc2 = make_tool_call("test_tool2", dummy_args);
|
|
||||||
auto out = try_raw_render(json::array({
|
|
||||||
dummy_user_msg,
|
|
||||||
make_tool_calls_msg(json::array({tc1, tc2})),
|
|
||||||
}), {}, false);
|
|
||||||
caps_.supports_parallel_tool_calls = contains(out, "test_tool1") && contains(out, "test_tool2");
|
|
||||||
|
|
||||||
out = try_raw_render(json::array({
|
|
||||||
dummy_user_msg,
|
|
||||||
make_tool_calls_msg(json::array({tc1})),
|
|
||||||
{
|
|
||||||
{"role", "tool"},
|
|
||||||
{"name", "test_tool1"},
|
|
||||||
{"content", "Some response!"},
|
|
||||||
{"tool_call_id", "call_911_"},
|
|
||||||
}
|
|
||||||
}), {}, false);
|
|
||||||
caps_.supports_tool_responses = contains(out, "Some response!");
|
|
||||||
caps_.supports_tool_call_id = contains(out, "call_911_");
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
if (!caps_.supports_tools) {
|
|
||||||
const json user_msg {
|
|
||||||
{"role", "user"},
|
|
||||||
{"content", "Hey"},
|
|
||||||
};
|
|
||||||
const json args {
|
|
||||||
{"arg1", "some_value"},
|
|
||||||
};
|
|
||||||
const json tool_call_msg {
|
|
||||||
{"role", "assistant"},
|
|
||||||
{"content", nullptr},
|
|
||||||
{"tool_calls", json::array({
|
|
||||||
{
|
|
||||||
// TODO: detect if requires numerical id or fixed length == 6 like Nemo
|
|
||||||
{"id", "call_1___"},
|
|
||||||
{"type", "function"},
|
|
||||||
{"function", {
|
|
||||||
{"name", "tool_name"},
|
|
||||||
{"arguments", (caps_.requires_object_arguments ? args : json(minja::Value(args).dump(-1, /* to_json= */ true)))},
|
|
||||||
}},
|
|
||||||
},
|
|
||||||
})},
|
|
||||||
};
|
|
||||||
std::string prefix, full;
|
|
||||||
{
|
|
||||||
chat_template_inputs inputs;
|
|
||||||
inputs.messages = json::array({user_msg});
|
|
||||||
inputs.add_generation_prompt = true;
|
|
||||||
prefix = apply(inputs);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
chat_template_inputs inputs;
|
|
||||||
inputs.messages = json::array({user_msg, tool_call_msg});
|
|
||||||
inputs.add_generation_prompt = false;
|
|
||||||
full = apply(inputs);
|
|
||||||
}
|
|
||||||
auto eos_pos_last = full.rfind(eos_token_);
|
|
||||||
if (eos_pos_last == prefix.size() - eos_token_.size() ||
|
|
||||||
(full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
|
|
||||||
full = full.substr(0, eos_pos_last);
|
|
||||||
}
|
|
||||||
size_t common_prefix_length = 0;
|
|
||||||
for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
|
|
||||||
if (prefix[i] != full[i]) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (prefix[i] == '<') {
|
|
||||||
// DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
|
|
||||||
// but it removes thinking tags for past messages.
|
|
||||||
// The prefix and full strings diverge at <think> vs. <|tool▁calls▁begin|>, we avoid consuming the leading <.
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
common_prefix_length = i + 1;
|
|
||||||
}
|
|
||||||
auto example = full.substr(common_prefix_length);
|
|
||||||
if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
|
|
||||||
fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
|
|
||||||
} else {
|
|
||||||
tool_call_example_ = example;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (const std::exception & e) {
|
|
||||||
fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string & source() const { return source_; }
|
|
||||||
const std::string & bos_token() const { return bos_token_; }
|
|
||||||
const std::string & eos_token() const { return eos_token_; }
|
|
||||||
const chat_template_caps & original_caps() const { return caps_; }
|
|
||||||
|
|
||||||
// Deprecated, please use the form with chat_template_inputs and chat_template_options
|
|
||||||
std::string apply(
|
|
||||||
const nlohmann::ordered_json & messages,
|
|
||||||
const nlohmann::ordered_json & tools,
|
|
||||||
bool add_generation_prompt,
|
|
||||||
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(),
|
|
||||||
bool apply_polyfills = true)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "[%s] Deprecated!\n", __func__);
|
|
||||||
chat_template_inputs inputs;
|
|
||||||
inputs.messages = messages;
|
|
||||||
inputs.tools = tools;
|
|
||||||
inputs.add_generation_prompt = add_generation_prompt;
|
|
||||||
inputs.extra_context = extra_context;
|
|
||||||
inputs.now = std::chrono::system_clock::now();
|
|
||||||
|
|
||||||
chat_template_options opts;
|
|
||||||
opts.apply_polyfills = apply_polyfills;
|
|
||||||
|
|
||||||
return apply(inputs, opts);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string apply(
|
|
||||||
const chat_template_inputs & inputs,
|
|
||||||
const chat_template_options & opts = chat_template_options()) const
|
|
||||||
{
|
|
||||||
json actual_messages;
|
|
||||||
|
|
||||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
|
||||||
auto has_tool_calls = false;
|
|
||||||
auto has_tool_responses = false;
|
|
||||||
auto has_string_content = false;
|
|
||||||
for (const auto & message : inputs.messages) {
|
|
||||||
if (message.contains("tool_calls") && !message["tool_calls"].is_null()) {
|
|
||||||
has_tool_calls = true;
|
|
||||||
}
|
|
||||||
if (message.contains("role") && message["role"] == "tool") {
|
|
||||||
has_tool_responses = true;
|
|
||||||
}
|
|
||||||
if (message.contains("content") && message["content"].is_string()) {
|
|
||||||
has_string_content = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
auto polyfill_system_role = opts.polyfill_system_role && !caps_.supports_system_role;
|
|
||||||
auto polyfill_tools = opts.polyfill_tools && has_tools && !caps_.supports_tools;
|
|
||||||
auto polyfill_tool_call_example = polyfill_tools && opts.polyfill_tool_call_examples;
|
|
||||||
auto polyfill_tool_calls = opts.polyfill_tool_calls && has_tool_calls && !caps_.supports_tool_calls;
|
|
||||||
auto polyfill_tool_responses = opts.polyfill_tool_responses && has_tool_responses && !caps_.supports_tool_responses;
|
|
||||||
auto polyfill_object_arguments = opts.polyfill_object_arguments && has_tool_calls && caps_.requires_object_arguments;
|
|
||||||
auto polyfill_typed_content = opts.polyfill_typed_content && has_string_content && caps_.requires_typed_content;
|
|
||||||
|
|
||||||
auto needs_polyfills = opts.apply_polyfills && (false
|
|
||||||
|| polyfill_system_role
|
|
||||||
|| polyfill_tools
|
|
||||||
|| polyfill_tool_calls
|
|
||||||
|| polyfill_tool_responses
|
|
||||||
|| polyfill_object_arguments
|
|
||||||
|| polyfill_typed_content
|
|
||||||
);
|
|
||||||
|
|
||||||
if (needs_polyfills) {
|
|
||||||
actual_messages = json::array();
|
|
||||||
|
|
||||||
auto add_message = [&](const json & msg) {
|
|
||||||
if (polyfill_typed_content && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) {
|
|
||||||
actual_messages.push_back({
|
|
||||||
{"role", msg.at("role")},
|
|
||||||
{"content", {{
|
|
||||||
{"type", "text"},
|
|
||||||
{"text", msg.at("content")},
|
|
||||||
}}},
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
actual_messages.push_back(msg);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
std::string pending_system;
|
|
||||||
auto flush_sys = [&]() {
|
|
||||||
if (!pending_system.empty()) {
|
|
||||||
add_message({
|
|
||||||
{"role", "user"},
|
|
||||||
{"content", pending_system},
|
|
||||||
});
|
|
||||||
pending_system.clear();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
json adjusted_messages;
|
|
||||||
if (polyfill_tools) {
|
|
||||||
adjusted_messages = add_system(inputs.messages,
|
|
||||||
"You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
|
|
||||||
(!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
|
|
||||||
} else {
|
|
||||||
adjusted_messages = inputs.messages;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const auto & message_ : adjusted_messages) {
|
|
||||||
auto message = message_;
|
|
||||||
if (!message.contains("role") || !message.contains("content")) {
|
|
||||||
throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
|
|
||||||
}
|
|
||||||
std::string role = message.at("role");
|
|
||||||
|
|
||||||
if (message.contains("tool_calls")) {
|
|
||||||
if (polyfill_object_arguments || polyfill_tool_calls) {
|
|
||||||
for (auto & tool_call : message.at("tool_calls")) {
|
|
||||||
if (tool_call["type"] == "function") {
|
|
||||||
auto & function = tool_call.at("function");
|
|
||||||
auto & arguments = function.at("arguments");
|
|
||||||
if (arguments.is_string()) {
|
|
||||||
try {
|
|
||||||
arguments = json::parse(arguments.get<std::string>());
|
|
||||||
} catch (const std::exception & ecvt) {
|
|
||||||
fprintf(stderr, "Failed to parse arguments: %s\n", ecvt.what());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (polyfill_tool_calls) {
|
|
||||||
auto content = message.at("content");
|
|
||||||
auto tool_calls = json::array();
|
|
||||||
for (const auto & tool_call : message.at("tool_calls")) {
|
|
||||||
if (tool_call.at("type") != "function") {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const auto & function = tool_call.at("function");
|
|
||||||
auto tc = json {
|
|
||||||
{"name", function.at("name")},
|
|
||||||
{"arguments", function.at("arguments")},
|
|
||||||
};
|
|
||||||
if (tool_call.contains("id")) {
|
|
||||||
tc["id"] = tool_call["id"];
|
|
||||||
}
|
|
||||||
tool_calls.push_back(tc);
|
|
||||||
}
|
|
||||||
auto obj = json {
|
|
||||||
{"tool_calls", tool_calls},
|
|
||||||
};
|
|
||||||
if (!content.is_null() && content != "") {
|
|
||||||
obj["content"] = content;
|
|
||||||
}
|
|
||||||
message["content"] = obj.dump(2);
|
|
||||||
message.erase("tool_calls");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (polyfill_tool_responses && role == "tool") {
|
|
||||||
message["role"] = "user";
|
|
||||||
auto obj = json {
|
|
||||||
{"tool_response", {
|
|
||||||
{"content", message.at("content")},
|
|
||||||
}},
|
|
||||||
};
|
|
||||||
if (message.contains("name")) {
|
|
||||||
obj["tool_response"]["name"] = message.at("name");
|
|
||||||
}
|
|
||||||
if (message.contains("tool_call_id")) {
|
|
||||||
obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
|
|
||||||
}
|
|
||||||
message["content"] = obj.dump(2);
|
|
||||||
message.erase("name");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!message["content"].is_null() && polyfill_system_role) {
|
|
||||||
std::string content = message.at("content");
|
|
||||||
if (role == "system") {
|
|
||||||
if (!pending_system.empty()) pending_system += "\n";
|
|
||||||
pending_system += content;
|
|
||||||
continue;
|
|
||||||
} else {
|
|
||||||
if (role == "user") {
|
|
||||||
if (!pending_system.empty()) {
|
|
||||||
message["content"] = pending_system + (content.empty() ? "" : "\n" + content);
|
|
||||||
pending_system.clear();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
flush_sys();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
add_message(message);
|
|
||||||
}
|
|
||||||
flush_sys();
|
|
||||||
} else {
|
|
||||||
actual_messages = inputs.messages;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto context = minja::Context::make(json({
|
|
||||||
{"messages", actual_messages},
|
|
||||||
{"add_generation_prompt", inputs.add_generation_prompt},
|
|
||||||
}));
|
|
||||||
context->set("bos_token", opts.use_bos_token ? bos_token_ : "");
|
|
||||||
context->set("eos_token", opts.use_eos_token ? eos_token_ : "");
|
|
||||||
if (opts.define_strftime_now) {
|
|
||||||
auto now = inputs.now;
|
|
||||||
context->set("strftime_now", Value::callable([now](const std::shared_ptr<minja::Context> &, minja::ArgumentsValue & args) {
|
|
||||||
args.expectArgs("strftime_now", {1, 1}, {0, 0});
|
|
||||||
auto format = args.args[0].get<std::string>();
|
|
||||||
|
|
||||||
auto time = std::chrono::system_clock::to_time_t(now);
|
|
||||||
auto local_time = *std::localtime(&time);
|
|
||||||
std::ostringstream ss;
|
|
||||||
ss << std::put_time(&local_time, format.c_str());
|
|
||||||
return ss.str();
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
if (!inputs.tools.is_null()) {
|
|
||||||
context->set("tools", minja::Value(inputs.tools));
|
|
||||||
}
|
|
||||||
if (!inputs.extra_context.is_null()) {
|
|
||||||
for (auto & kv : inputs.extra_context.items()) {
|
|
||||||
context->set(kv.key(), minja::Value(kv.value()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
auto ret = template_root_->render(context);
|
|
||||||
// fprintf(stderr, "actual_messages: %s\n", actual_messages.dump(2).c_str());
|
|
||||||
// fprintf(stderr, "apply: %s\n\n", ret.c_str());
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
|
|
||||||
json messages_with_system = messages;
|
|
||||||
|
|
||||||
if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
|
|
||||||
std::string existing_system = messages_with_system.at(0).at("content");
|
|
||||||
messages_with_system[0] = json {
|
|
||||||
{"role", "system"},
|
|
||||||
{"content", existing_system + "\n\n" + system_prompt},
|
|
||||||
};
|
|
||||||
} else {
|
|
||||||
messages_with_system.insert(messages_with_system.begin(), json {
|
|
||||||
{"role", "system"},
|
|
||||||
{"content", system_prompt},
|
|
||||||
});
|
|
||||||
}
|
|
||||||
return messages_with_system;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace minja
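For orientation, a minimal sketch of driving the header whose removal is shown above: construct a minja::chat_template from a Jinja source string plus BOS/EOS tokens, fill chat_template_inputs, and call apply(). The toy template string below is an illustrative placeholder, not one of the real model templates.

// Hedged sketch, assuming chat-template.hpp and its bundled minja/json headers are on the include path.
#include "chat-template.hpp"

#include <cstdio>

int main() {
    minja::chat_template tmpl(
        "{% for m in messages %}<|{{ m.role }}|>{{ m.content }}\n{% endfor %}"
        "{% if add_generation_prompt %}<|assistant|>{% endif %}",
        /* bos_token= */ "", /* eos_token= */ "");

    minja::chat_template_inputs inputs;
    inputs.messages = nlohmann::ordered_json::array({
        {{"role", "system"}, {"content", "Be terse."}},
        {{"role", "user"},   {"content", "Hi!"}},
    });
    inputs.add_generation_prompt = true;

    minja::chat_template_options opts; // defaults: polyfills enabled
    printf("%s\n", tmpl.apply(inputs, opts).c_str());
    return 0;
}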
|
|
966	common/chat.cpp
@ -1,966 +0,0 @@
|
||||||
#include "chat.hpp"
|
|
||||||
#include "chat-template.hpp"
|
|
||||||
#include "json-schema-to-grammar.h"
|
|
||||||
#include "log.h"
|
|
||||||
#include "minja.hpp"
|
|
||||||
|
|
||||||
std::string common_chat_format_name(common_chat_format format) {
|
|
||||||
switch (format) {
|
|
||||||
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
|
|
||||||
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
|
|
||||||
case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
|
|
||||||
case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
|
|
||||||
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
|
|
||||||
case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
|
|
||||||
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
|
|
||||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
|
|
||||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
|
|
||||||
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
|
|
||||||
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
|
|
||||||
default:
|
|
||||||
throw std::runtime_error("Unknown chat format");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const common_grammar_options grammar_options {
|
|
||||||
/* .dotall = */ false,
|
|
||||||
/* .compact_spaces = */ false,
|
|
||||||
// /* .compact_spaces = */ true,
|
|
||||||
};
|
|
||||||
|
|
||||||
static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) {
|
|
||||||
// // https://json.nlohmann.me/features/parsing/sax_interface/
|
|
||||||
struct json_error_locator : public nlohmann::json_sax<json> {
|
|
||||||
std::size_t position;
|
|
||||||
bool found_error;
|
|
||||||
|
|
||||||
json_error_locator() : position(0), found_error(false) {}
|
|
||||||
|
|
||||||
bool parse_error(std::size_t position, const std::string &, const json::exception &) override {
|
|
||||||
this->position = position - 1;
|
|
||||||
this->found_error = true;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
bool null() override { return true; }
|
|
||||||
bool boolean(bool) override { return true; }
|
|
||||||
bool number_integer(number_integer_t) override { return true; }
|
|
||||||
bool number_unsigned(number_unsigned_t) override { return true; }
|
|
||||||
bool number_float(number_float_t, const string_t &) override { return true; }
|
|
||||||
bool string(string_t &) override { return true; }
|
|
||||||
bool binary(binary_t &) override { return true; }
|
|
||||||
bool start_object(std::size_t) override { return true; }
|
|
||||||
bool key(string_t &) override { return true; }
|
|
||||||
bool end_object() override { return true; }
|
|
||||||
bool start_array(std::size_t) override { return true; }
|
|
||||||
bool end_array() override { return true; }
|
|
||||||
};
|
|
||||||
json_error_locator err_loc;
|
|
||||||
json::sax_parse(it, end, &err_loc);
|
|
||||||
|
|
||||||
std::string::const_iterator temptative_end;
|
|
||||||
if (err_loc.found_error) {
|
|
||||||
temptative_end = it + err_loc.position;
|
|
||||||
} else {
|
|
||||||
temptative_end = end;
|
|
||||||
}
|
|
||||||
std::string json_sub {it, temptative_end};
|
|
||||||
try {
|
|
||||||
out = json::parse(json_sub);
|
|
||||||
it = temptative_end;
|
|
||||||
return true;
|
|
||||||
} catch (const std::exception &) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
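The SAX error-locator above exists so that a JSON value followed by arbitrary template text can be parsed: the first pass only finds where well-formed JSON stops, the second parses that prefix. A hedged illustration follows; parse_json is file-static, so this would have to live in the same translation unit (the demo helper is hypothetical).

// Hypothetical helper in this file, for illustration only.
static void parse_json_demo() {
    const std::string s = "{\"code\": \"print(1)\"}<|tool▁call▁end|> trailing text";
    std::string::const_iterator it = s.begin();
    json args;
    if (parse_json(it, s.end(), args)) {
        // args now holds {"code": "print(1)"} and `it` is left on the first
        // character after the JSON object, i.e. the start of "<|tool▁call▁end|>".
    }
}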
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
|
|
||||||
* Aggregates the prefix, suffix and in-between text into the content.
|
|
||||||
*/
|
|
||||||
static common_chat_msg parse_json_tool_calls(
|
|
||||||
const std::string& input,
|
|
||||||
const std::optional<std::regex> & trigger_opt,
|
|
||||||
const std::regex & function_regex,
|
|
||||||
const std::regex & close_regex) {
|
|
||||||
std::smatch match;
|
|
||||||
|
|
||||||
common_chat_msg result;
|
|
||||||
result.role = "assistant";
|
|
||||||
|
|
||||||
|
|
||||||
auto end = input.end();
|
|
||||||
auto it = input.begin();
|
|
||||||
|
|
||||||
if (trigger_opt) {
|
|
||||||
if (!std::regex_search(it, end, match, *trigger_opt)) {
|
|
||||||
result.content = input;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
result.content = match.prefix().str();
|
|
||||||
it = match.suffix().first;
|
|
||||||
}
|
|
||||||
|
|
||||||
while (it != end) {
|
|
||||||
std::sregex_iterator rend;
|
|
||||||
std::sregex_iterator rit(it, end, function_regex);
|
|
||||||
if (rit == rend) {
|
|
||||||
fprintf(stderr, "No more tool calls found\n");
|
|
||||||
result.content += std::string(it, end);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
auto name = rit->str(1);
|
|
||||||
result.content += std::string(it, rit->prefix().second);
|
|
||||||
it = rit->suffix().first;
|
|
||||||
|
|
||||||
json arguments;
|
|
||||||
if (!parse_json(it, end, arguments)) {
|
|
||||||
throw std::runtime_error("Failed to parse json tool call arguments");
|
|
||||||
}
|
|
||||||
if (!std::regex_search(it, end, match, close_regex)) {
|
|
||||||
throw std::runtime_error("Malformed input, missing closing pattern");
|
|
||||||
}
|
|
||||||
it = match.suffix().first;
|
|
||||||
result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) {
|
|
||||||
auto content_end = input.find(prefix);
|
|
||||||
size_t tc_start = std::string::npos;
|
|
||||||
|
|
||||||
common_chat_msg result;
|
|
||||||
result.role = "assistant";
|
|
||||||
const auto process_tool_calls = [&](const json & tool_calls) {
|
|
||||||
for (const auto & tool_call : tool_calls) {
|
|
||||||
const auto & arguments = tool_call["arguments"];
|
|
||||||
result.tool_calls.push_back({
|
|
||||||
tool_call["name"],
|
|
||||||
arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
|
|
||||||
tool_call.contains("id") ? tool_call["id"] : "",
|
|
||||||
});
|
|
||||||
}
|
|
||||||
};
|
|
||||||
if (content_end == std::string::npos) {
|
|
||||||
result.content = input;
|
|
||||||
} else {
|
|
||||||
tc_start = content_end + prefix.size() - rstrip_prefix;
|
|
||||||
result.content = input.substr(0, content_end);
|
|
||||||
auto tool_calls = json::parse(input.substr(tc_start));
|
|
||||||
process_tool_calls(tool_calls);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
|
|
||||||
for (const auto & tool : tools) {
|
|
||||||
if (!tool.contains("type") || tool["type"] != "function" || !tool.contains("function")) {
|
|
||||||
LOG_INF("Skipping tool without function: %s", tool.dump(2).c_str());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
fn(tool);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::string apply(
|
|
||||||
const common_chat_template & tmpl,
|
|
||||||
const nlohmann::ordered_json & messages,
|
|
||||||
const nlohmann::ordered_json & tools,
|
|
||||||
bool add_generation_prompt,
|
|
||||||
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
|
|
||||||
{
|
|
||||||
minja::chat_template_inputs tmpl_inputs;
|
|
||||||
tmpl_inputs.messages = messages;
|
|
||||||
tmpl_inputs.tools = tools;
|
|
||||||
tmpl_inputs.add_generation_prompt = add_generation_prompt;
|
|
||||||
tmpl_inputs.extra_context = extra_context;
|
|
||||||
// TODO: add flag to control date/time, if only for testing purposes.
|
|
||||||
// tmpl_inputs.now = std::chrono::system_clock::now();
|
|
||||||
|
|
||||||
minja::chat_template_options tmpl_opts;
|
|
||||||
tmpl_opts.use_bos_token = false;
|
|
||||||
tmpl_opts.use_eos_token = false;
|
|
||||||
|
|
||||||
return tmpl.apply(tmpl_inputs, tmpl_opts);
|
|
||||||
}
|
|
||||||
|
|
||||||
static common_chat_params common_chat_params_init_generic(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
|
||||||
common_chat_params data;
|
|
||||||
|
|
||||||
auto tool_call_schemas = json::array();
|
|
||||||
foreach_function(inputs.tools, [&](const json & tool) {
|
|
||||||
const auto & function = tool["function"];
|
|
||||||
auto tool_schema = json {
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
{"name", {
|
|
||||||
{"type", "string"},
|
|
||||||
{"const", function["name"]},
|
|
||||||
}},
|
|
||||||
{"arguments", function["parameters"]},
|
|
||||||
}},
|
|
||||||
{"required", json::array({"name", "arguments"})},
|
|
||||||
};
|
|
||||||
if (function.contains("description")) {
|
|
||||||
tool_schema["description"] = function["description"];
|
|
||||||
}
|
|
||||||
if (inputs.parallel_tool_calls) {
|
|
||||||
tool_schema["properties"]["id"] = {
|
|
||||||
{"type", "string"},
|
|
||||||
{"minLength", 4},
|
|
||||||
};
|
|
||||||
tool_schema["required"].push_back("id");
|
|
||||||
}
|
|
||||||
tool_call_schemas.emplace_back(tool_schema);
|
|
||||||
});
|
|
||||||
const auto tool_call =
|
|
||||||
inputs.parallel_tool_calls
|
|
||||||
? json {
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
{"tool_calls", {
|
|
||||||
{"type", "array"},
|
|
||||||
{"items", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
|
|
||||||
{"anyOf", tool_call_schemas},
|
|
||||||
}},
|
|
||||||
{"minItems", 1},
|
|
||||||
}},
|
|
||||||
}},
|
|
||||||
{"required", json::array({"tool_calls"})},
|
|
||||||
}
|
|
||||||
: json {
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
{"tool_call", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
|
|
||||||
{"anyOf", tool_call_schemas},
|
|
||||||
}},
|
|
||||||
}},
|
|
||||||
{"required", json::array({"tool_call"})},
|
|
||||||
};
|
|
||||||
const auto schema =
|
|
||||||
inputs.tool_choice != "required"
|
|
||||||
? json {
|
|
||||||
{"anyOf", json::array({
|
|
||||||
tool_call,
|
|
||||||
{
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
{"response", inputs.json_schema.is_null()
|
|
||||||
? json {{"type", "string"}}
|
|
||||||
: inputs.json_schema
|
|
||||||
},
|
|
||||||
}},
|
|
||||||
{"required", json::array({"response"})},
|
|
||||||
},
|
|
||||||
})}
|
|
||||||
}
|
|
||||||
: tool_call;
|
|
||||||
|
|
||||||
data.grammar_lazy = false;
|
|
||||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
|
||||||
builder.add_schema("root", schema);
|
|
||||||
}, grammar_options);
|
|
||||||
|
|
||||||
auto tweaked_messages = common_chat_template::add_system(
|
|
||||||
inputs.messages,
|
|
||||||
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
|
|
||||||
|
|
||||||
data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
|
||||||
data.format = COMMON_CHAT_FORMAT_GENERIC;
|
|
||||||
return data;
|
|
||||||
}
|
|
||||||
static common_chat_msg common_chat_parse_generic(const std::string & input) {
|
|
||||||
json data = json::parse(input);
|
|
||||||
common_chat_msg result;
|
|
||||||
result.role = "assistant";
|
|
||||||
if (data.contains("tool_calls")) {
|
|
||||||
for (const auto & tool_call : data["tool_calls"]) {
|
|
||||||
result.tool_calls.push_back({
|
|
||||||
tool_call["name"],
|
|
||||||
tool_call["arguments"].dump(),
|
|
||||||
tool_call.contains("id") ? tool_call["id"] : "",
|
|
||||||
});
|
|
||||||
}
|
|
||||||
} else if (data.contains("tool_call")) {
|
|
||||||
result.tool_calls.push_back({
|
|
||||||
data["tool_call"]["name"],
|
|
||||||
data["tool_call"]["arguments"].dump(),
|
|
||||||
/* id= */ "",
|
|
||||||
});
|
|
||||||
} else if (data.contains("response")) {
|
|
||||||
const auto & response = data["response"];
|
|
||||||
result.content = response.is_string() ? response.get<std::string>() : response.dump(2);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
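For reference, the generic handler constrains the model to one of a few JSON shapes, and that is what this parser unpacks. The tool name and arguments below are made-up placeholders; only the field names come from the schema and parser above.

// Illustration only: outputs accepted by common_chat_parse_generic (placeholders).
// Single tool call:    {"tool_call":  {"name": "get_weather", "arguments": {"city": "Paris"}}}
// Parallel tool calls: {"tool_calls": [{"name": "get_weather", "arguments": {"city": "Paris"}, "id": "1234"}]}
// Plain answer:        {"response": "It is sunny in Paris."}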
|
|
||||||
|
|
||||||
static common_chat_params common_chat_params_init_mistral_nemo(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
|
||||||
common_chat_params data;
|
|
||||||
data.grammar_lazy = inputs.tool_choice != "required";
|
|
||||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
|
||||||
auto schemas = json::array();
|
|
||||||
foreach_function(inputs.tools, [&](const json & tool) {
|
|
||||||
const auto & function = tool["function"];
|
|
||||||
schemas.push_back({
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
// Important note: the model is probably trained to take a JSON stringified arguments value.
|
|
||||||
// It's hard to constrain that for now (while reusing the JSON schema conversion), so we're just expecting a plain object.
|
|
||||||
{"name", {
|
|
||||||
{"type", "string"},
|
|
||||||
{"const", function["name"]},
|
|
||||||
}},
|
|
||||||
{"arguments", function["parameters"]},
|
|
||||||
{"id", {
|
|
||||||
{"type", "string"},
|
|
||||||
// Nemo's template expects a 9-character alphanumeric ID.
|
|
||||||
{"pattern", "^[a-zA-Z0-9]{9}$"},
|
|
||||||
}},
|
|
||||||
}},
|
|
||||||
{"required", json::array({"name", "arguments", "id"})},
|
|
||||||
});
|
|
||||||
});
|
|
||||||
auto schema = json {
|
|
||||||
{"type", "array"},
|
|
||||||
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
|
|
||||||
{"minItems", 1},
|
|
||||||
};
|
|
||||||
if (!inputs.parallel_tool_calls) {
|
|
||||||
schema["maxItems"] = 1;
|
|
||||||
}
|
|
||||||
builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
|
|
||||||
}, grammar_options);
|
|
||||||
data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true});
|
|
||||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
|
||||||
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
|
|
||||||
return data;
|
|
||||||
}
|
|
||||||
static common_chat_msg common_chat_parse_mistral_nemo(const std::string & input) {
|
|
||||||
return parse_prefixed_json_tool_call_array(input, "[TOOL_CALLS]");
|
|
||||||
}
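An illustrative example of the assistant turn this parser expects: the "[TOOL_CALLS]" prefix and the {name, arguments, id} fields (with a 9-character alphanumeric id) come from the grammar built above, while the tool name and arguments are made-up placeholders.

// Illustration only.
static const char * k_nemo_example =
    "[TOOL_CALLS][{\"name\": \"get_weather\", "
    "\"arguments\": {\"city\": \"Paris\"}, \"id\": \"abc123def\"}]";
// common_chat_parse_mistral_nemo(k_nemo_example) would yield one tool call named
// "get_weather" with its arguments re-serialized as a JSON string.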
|
|
||||||
|
|
||||||
static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
|
||||||
common_chat_params data;
|
|
||||||
data.grammar_lazy = inputs.tool_choice != "required";
|
|
||||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
|
||||||
auto schemas = json::array();
|
|
||||||
foreach_function(inputs.tools, [&](const json & tool) {
|
|
||||||
const auto & function = tool["function"];
|
|
||||||
schemas.push_back({
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
{"tool_call_id", {
|
|
||||||
{"type", "string"},
|
|
||||||
// Command-R's template expects an integer string.
|
|
||||||
{"pattern", "^[0-9]{1,10}$"},
|
|
||||||
}},
|
|
||||||
{"tool_name", {
|
|
||||||
{"type", "string"},
|
|
||||||
{"const", function["name"]},
|
|
||||||
}},
|
|
||||||
{"parameters", function["parameters"]},
|
|
||||||
}},
|
|
||||||
{"required", json::array({"tool_call_id", "tool_name", "parameters"})},
|
|
||||||
});
|
|
||||||
});
|
|
||||||
auto schema = json {
|
|
||||||
{"type", "array"},
|
|
||||||
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
|
|
||||||
{"minItems", 1},
|
|
||||||
};
|
|
||||||
if (!inputs.parallel_tool_calls) {
|
|
||||||
schema["maxItems"] = 1;
|
|
||||||
}
|
|
||||||
builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
|
|
||||||
}, grammar_options);
|
|
||||||
data.grammar_triggers.push_back({"<|START_ACTION|>", /* .at_start = */ false});
|
|
||||||
data.preserved_tokens = {
|
|
||||||
"<|START_RESPONSE|>",
|
|
||||||
"<|END_RESPONSE|>",
|
|
||||||
"<|START_THINKING|>",
|
|
||||||
"<|END_THINKING|>",
|
|
||||||
"<|END_ACTION|>",
|
|
||||||
};
|
|
||||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
|
||||||
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
|
|
||||||
return data;
|
|
||||||
}
|
|
||||||
static common_chat_msg common_chat_parse_command_r7b(const std::string & input) {
|
|
||||||
static std::regex response_regex("<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
|
|
||||||
static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
|
|
||||||
std::smatch match;
|
|
||||||
|
|
||||||
common_chat_msg result;
|
|
||||||
result.role = "assistant";
|
|
||||||
if (std::regex_match(input, match, response_regex)) {
|
|
||||||
result.content = match[1].str();
|
|
||||||
} else if (std::regex_match(input, match, thought_action_regex)) {
|
|
||||||
result.tool_plan = match[1].str();
|
|
||||||
auto actions_str = match[2].str();
|
|
||||||
auto actions = json::parse(actions_str);
|
|
||||||
for (const auto & action : actions) {
|
|
||||||
result.tool_calls.push_back({
|
|
||||||
/* .name = */ action["tool_name"],
|
|
||||||
/* .arguments = */ action["parameters"].dump(),
|
|
||||||
/* .id = */ action["tool_call_id"],
|
|
||||||
});
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOG_ERR("Failed to parse command_r output");
|
|
||||||
result.content = input;
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
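For clarity, the two output shapes common_chat_parse_command_r7b accepts, as implied by the regexes above; tool name and parameters are illustrative placeholders, the special tokens and field names are taken from the code.

// Illustration only.
static const char * k_r7b_response =
    "<|START_RESPONSE|>Hello there.<|END_RESPONSE|>";
static const char * k_r7b_action =
    "<|START_THINKING|>I should call a tool.<|END_THINKING|>"
    "<|START_ACTION|>[{\"tool_call_id\": \"0\", \"tool_name\": \"get_weather\", "
    "\"parameters\": {\"city\": \"Paris\"}}]<|END_ACTION|>";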
|
|
||||||
|
|
||||||
static void expect_tool_parameters(const std::string & name, const json & parameters, const std::vector<std::string> & expected_properties) {
|
|
||||||
if (!parameters.is_object() || !parameters.contains("type") || parameters["type"] != "object" || !parameters.contains("properties") || !parameters.contains("required")) {
|
|
||||||
throw std::runtime_error("Parameters of tool " + name + " must be an object w/ required properties");
|
|
||||||
}
|
|
||||||
const auto & parameters_properties = parameters.at("properties");
|
|
||||||
const auto & parameters_required = parameters.at("required");
|
|
||||||
for (const auto & prop : expected_properties) {
|
|
||||||
if (!parameters_properties.contains(prop)) {
|
|
||||||
throw std::runtime_error("Parameters of tool " + name + " is missing property: " + prop);
|
|
||||||
}
|
|
||||||
if (std::find(parameters_required.begin(), parameters_required.end(), json(prop)) == parameters_required.end()) {
|
|
||||||
throw std::runtime_error("Parameters of tool " + name + " must have property marked as required: " + prop);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (parameters_properties.size() != expected_properties.size()) {
|
|
||||||
throw std::runtime_error("Parameters of tool " + name + " must only have these properties:" + string_join(expected_properties, ", "));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const common_chat_template & tmpl, const struct common_chat_inputs & inputs, bool allow_python_tag_builtin_tools) {
|
|
||||||
auto builtin_tools = json::array();
|
|
||||||
common_chat_params data;
|
|
||||||
data.grammar_lazy = inputs.tool_choice != "required";
|
|
||||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
|
||||||
std::vector<std::string> tool_rules;
|
|
||||||
|
|
||||||
auto handle_builtin_tool = [&](const std::string & name, const json & parameters) {
|
|
||||||
if (name == "wolfram_alpha") {
|
|
||||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
|
|
||||||
expect_tool_parameters(name, parameters, {"query"});
|
|
||||||
} else if (name == "web_search" || name == "brave_search") {
|
|
||||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
|
|
||||||
expect_tool_parameters(name, parameters, {"query"});
|
|
||||||
} else if (name == "python" || name == "code_interpreter") {
|
|
||||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
|
|
||||||
expect_tool_parameters(name, parameters, {"code"});
|
|
||||||
} else {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::string> kvs;
|
|
||||||
for (const auto & [key, value] : parameters.at("properties").items()) {
|
|
||||||
kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value));
|
|
||||||
}
|
|
||||||
|
|
||||||
tool_rules.push_back(
|
|
||||||
builder.add_rule(
|
|
||||||
name + "-call",
|
|
||||||
"\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\""));
|
|
||||||
builtin_tools.push_back(name);
|
|
||||||
|
|
||||||
return true;
|
|
||||||
};
|
|
||||||
|
|
||||||
foreach_function(inputs.tools, [&](const json & tool) {
|
|
||||||
const auto & function = tool["function"];
|
|
||||||
std::string name = function["name"];
|
|
||||||
auto parameters = function["parameters"];
|
|
||||||
builder.resolve_refs(parameters);
|
|
||||||
|
|
||||||
// https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
|
|
||||||
if (allow_python_tag_builtin_tools) {
|
|
||||||
handle_builtin_tool(name, parameters);
|
|
||||||
}
|
|
||||||
tool_rules.push_back(
|
|
||||||
builder.add_rule(
|
|
||||||
name + "-call",
|
|
||||||
"\"{\" space "
|
|
||||||
"( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? "
|
|
||||||
"\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
|
|
||||||
builder.add_schema(name + "-args", parameters) +
|
|
||||||
" \"}\""));
|
|
||||||
data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true});
|
|
||||||
});
|
|
||||||
data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true});
|
|
||||||
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
|
|
||||||
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
|
|
||||||
data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true});
|
|
||||||
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
|
|
||||||
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
|
|
||||||
if (!builtin_tools.empty()) {
|
|
||||||
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
|
|
||||||
}
|
|
||||||
builder.add_rule("root", string_join(tool_rules, " | "));
|
|
||||||
}, grammar_options);
|
|
||||||
data.additional_stops.push_back("<|eom_id|>");
|
|
||||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
|
|
||||||
{"tools_in_user_message", false},
|
|
||||||
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
|
|
||||||
});
|
|
||||||
data.format = allow_python_tag_builtin_tools && !builtin_tools.empty()
|
|
||||||
? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS
|
|
||||||
: COMMON_CHAT_FORMAT_LLAMA_3_X;
|
|
||||||
return data;
|
|
||||||
}
|
|
||||||
static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
|
|
||||||
// TODO: tighten & simplify the parser, don't accept leading text context.
|
|
||||||
static std::regex function_regex("\\{[\\s\\n\\r]*(?:\"type\"[\\s\\n\\r]*:[\\s\\n\\r]*\"function\"[\\s\\n\\r]*,[\\s\\n\\r]*|[\\s\\n\\r]*)\"name\"[\\s\\n\\r]*:[\\s\\n\\r]*\"([^\"]+)\"[\\s\\n\\r]*,[\\s\\n\\r]*\"parameters\": ");
|
|
||||||
static std::regex close_regex("\\}");
|
|
||||||
static std::regex builtin_call_regex("<\\|python_tag\\|>([^.(]+)\\.call\\((.*)\\)");
|
|
||||||
|
|
||||||
if (with_builtin_tools) {
|
|
||||||
std::smatch match;
|
|
||||||
if (std::regex_match(input, match, builtin_call_regex)) {
|
|
||||||
auto name = match[1].str();
|
|
||||||
auto raw_args = match[2].str();
|
|
||||||
|
|
||||||
// TODO: if/when builtin tools start accepting more than 1 argument, use parse_json for real parsing.
|
|
||||||
auto it_eq = raw_args.find('=');
|
|
||||||
auto arg_name = raw_args.substr(0, it_eq);
|
|
||||||
auto arg_value_str = raw_args.substr(it_eq + 1);
|
|
||||||
auto arg_value = json::parse(arg_value_str);
|
|
||||||
|
|
||||||
return {
|
|
||||||
/* .role = */ "assistant",
|
|
||||||
/* .content = */ match.prefix().str(),
|
|
||||||
/* .tool_calls = */ {
|
|
||||||
{
|
|
||||||
/* .name = */ match[1],
|
|
||||||
/* .arguments = */ (json {
|
|
||||||
{arg_name, arg_value},
|
|
||||||
}).dump(),
|
|
||||||
/* .id = */ "",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
|
|
||||||
}
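Two illustrative inputs this parser handles: the generic JSON form matched by function_regex, and the builtin-tool form matched by builtin_call_regex. The tool names and arguments are placeholders, except brave_search/query, which appear in the builtin-tool checks above.

// Illustration only.
static const char * k_llama3_json_call =
    "{\"name\": \"get_weather\", \"parameters\": {\"city\": \"Paris\"}}";
static const char * k_llama3_builtin_call =
    "<|python_tag|>brave_search.call(query=\"weather in Paris\")";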
|
|
||||||
|
|
||||||
static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
|
||||||
common_chat_params data;
|
|
||||||
data.grammar_lazy = inputs.tool_choice != "required";
|
|
||||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
|
||||||
std::vector<std::string> tool_rules;
|
|
||||||
foreach_function(inputs.tools, [&](const json & tool) {
|
|
||||||
const auto & function = tool["function"];
|
|
||||||
std::string name = function["name"];
|
|
||||||
auto parameters = function["parameters"];
|
|
||||||
auto args_rule = builder.add_schema(name + "-args", parameters);
|
|
||||||
tool_rules.push_back(builder.add_rule(name + "-call",
|
|
||||||
"\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n```json\\n\" " + args_rule + " \"```<|tool▁call▁end|>\""));
|
|
||||||
});
|
|
||||||
data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", /* .at_start = */ false});
|
|
||||||
data.preserved_tokens = {
|
|
||||||
"<|tool▁sep|>",
|
|
||||||
"<|tool▁call▁end|>",
|
|
||||||
};
|
|
||||||
builder.add_rule("root", "\"<|tool▁calls▁begin|>\" (" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " space");
|
|
||||||
}, grammar_options);
|
|
||||||
auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
|
||||||
data.prompt = prompt;
|
|
||||||
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
|
|
||||||
return data;
|
|
||||||
}
|
|
||||||
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input) {
|
|
||||||
static std::regex trigger_regex("<|tool▁calls▁begin|>");
|
|
||||||
static std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n");
|
|
||||||
static std::regex close_regex("```<|tool▁call▁end|>");
|
|
||||||
return parse_json_tool_calls(input, trigger_regex, function_regex, close_regex);
|
|
||||||
}

static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
    fprintf(stderr, "%s\n", __func__);
    common_chat_params data;
    data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
        {"datetime", "Jan 29 2025 13:00:00 GMT"},
        {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
    });
    if (!inputs.tools.is_null() && !inputs.tools.empty()) {
        data.grammar_lazy = inputs.tool_choice != "required";
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            auto schemas = json::array();
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool["function"];
                schemas.push_back({
                    {"type", "object"},
                    {"properties", {
                        {"name", {
                            {"type", "string"},
                            {"const", function["name"]},
                        }},
                        {"arguments", function["parameters"]},
                    }},
                    {"required", json::array({"name", "arguments", "id"})},
                });
            });
            auto schema = json {
                {"type", "array"},
                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
                {"minItems", 1},
            };
            if (!inputs.parallel_tool_calls) {
                schema["maxItems"] = 1;
            }
            builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema));
        }, grammar_options);
        data.grammar_triggers.push_back({" functools[", /* .at_start = */ false});
        data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2;
    } else {
        data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    }
    return data;
}

static common_chat_msg common_chat_parse_firefunction_v2(const std::string & input) {
    return parse_prefixed_json_tool_call_array(input, " functools[", /* rstrip_prefix= */ 1);
}
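
// Illustrative sketch (assumed example values, not part of the original file):
// FireFunction v2 emits a JSON array prefixed by " functools", e.g.
//
//    functools[{"name": "get_weather", "arguments": {"location": "Paris"}, "id": "call-1"}]
//
// parse_prefixed_json_tool_call_array() locates the " functools[" prefix and turns each
// array element into one entry of common_chat_msg::tool_calls.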

static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
    // >>>all\nlet's call functions>>>fn1\n{"arg1": 1...}\n>>>fn2\n{"arg1": 1...}...
    // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
    common_chat_params data;
    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
    data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
    if (!inputs.tools.is_null() && !inputs.tools.empty()) {
        data.grammar_lazy = inputs.tool_choice != "required";
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            std::vector<std::string> first_tool_rules;
            std::vector<std::string> subsequent_tool_rules;
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool["function"];
                std::string name = function["name"];
                auto parameters = function["parameters"];
                auto args_rule = builder.add_schema(name + "-args", parameters);
                first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule));
                subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule));
                data.grammar_triggers.push_back({name, /* .at_start = */ true});
                data.grammar_triggers.push_back({">>>" + name, /* .at_start = */ false});
            });
            auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space";
            if (inputs.parallel_tool_calls) {
                auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space";
                builder.add_rule("root", first_rule + " (" + subsequent_rule + ")*");
            } else {
                builder.add_rule("root", first_rule);
            }

        }, grammar_options);
    }
    return data;
}

static bool consume(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
    auto expected_it = expected.begin();
    auto tmp_it = it;
    while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
        ++tmp_it;
        ++expected_it;
    }
    if (expected_it == expected.end()) {
        it = tmp_it;
        return true;
    }
    return false;
}
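
// Minimal usage sketch (not part of the original file): consume() advances the iterator
// only when the whole expected string matches; on a partial match `it` is left unchanged.
//
//   std::string s = "all\nhello";
//   auto it = s.cbegin();
//   bool a = consume(it, s.cend(), "all\n"); // a == true,  it now points at 'h'
//   bool b = consume(it, s.cend(), ">>>");   // b == false, it is unchanged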

static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
    static std::regex function_regex(R"((?:>>>)?(\w+)\n)");
    static std::regex close_regex(R"($|(?=>>>))");

    std::string content;
    auto it = input.begin();
    const auto end = input.end();

    if (consume(it, end, "all\n")) {
        std::smatch match;
        if (std::regex_search(it, end, match, function_regex)) {
            auto fun_it = match.prefix().second;
            content = std::string(it, fun_it);
            it = fun_it;
        } else {
            common_chat_msg res;
            res.role = "assistant";
            res.content = std::string(it, end);
            return res;
        }
    }
    // TODO: tighten & simplify.
    try {
        auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
        res.content = content + res.content;
        return res;
    } catch (const std::exception & e) {
        LOG_ERR("Failed to parse functionary v3.2 input: %s\n", e.what());
        common_chat_msg res;
        res.role = "assistant";
        res.content = input;
        return res;
    }
}

static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
    // https://github.com/MeetKai/functionary/blob/main/tests/prompt_test_v3-llama3.1.txt
    common_chat_params data;
    json tools = inputs.tools.is_null() ? inputs.tools : json::array();
    std::string python_code_argument_name;
    auto has_raw_python = false;

    data.grammar_lazy = inputs.tool_choice != "required";
    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
        std::vector<std::string> tool_rules;
        foreach_function(inputs.tools, [&](const json & tool) {
            const auto & function = tool["function"];
            const auto & parameters = function["parameters"];
            std::string name = function["name"];
            if (name == "python" || name == "ipython") {
                if (!parameters.contains("type")) {
                    throw std::runtime_error("Missing type in python tool");
                }
                has_raw_python = true;
                auto type = parameters.at("type");
                if (type == "object") {
                    auto properties = parameters.at("properties");
                    for (auto it = properties.begin(); it != properties.end(); ++it) {
                        if (it.value().at("type") == "string") {
                            if (!python_code_argument_name.empty()) {
                                throw std::runtime_error("Multiple string arguments found in python tool");
                            }
                            python_code_argument_name = it.key();
                        }
                    }
                    if (python_code_argument_name.empty()) {
                        throw std::runtime_error("No string argument found in python tool");
                    }
                } else if (type != "string") {
                    throw std::runtime_error("Invalid type in python tool: " + type.dump());
                }
            }
            tool_rules.push_back(builder.add_rule(name + "-call", "\"<function=" + name + ">\" " + builder.add_schema(name + "-args", parameters) + " \"</function>\" space"));
        });
        if (has_raw_python) {
            tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
            data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
        }
        auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
        builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
        data.grammar_triggers.push_back({"<function=", /* .at_start = */ false});
    }, grammar_options);

    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
    // TODO: if (has_raw_python)
    data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
    return data;
}

static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
    // This version of Functionary still supports the llama 3.1 tool call format for the python tool.
    static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
    std::smatch match;
    if (std::regex_search(input, match, python_tag_regex)) {
        auto code = match[1].str();
        return {
            /* .role = */ "assistant",
            /* .content = */ match.prefix().str(),
            /* .tool_calls = */ {
                {
                    /* .name = */ "python",
                    /* .arguments = */ (json {{"code", code}}).dump(),
                    /* .id = */ "",
                },
            }
        };
    }
    static std::regex function_regex(R"(<function=(\w+)>)");
    static std::regex close_regex(R"(</function>)");
    // TODO: tighten & simplify.
    return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
}
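
// Illustrative inputs (assumed example values, not part of the original file):
//
//   <|python_tag|>print("hi")
//     -> one "python" tool call with arguments {"code": "print(\"hi\")"}
//
//   <function=get_weather>{"location": "Paris"}</function>
//     -> one "get_weather" tool call with arguments {"location": "Paris"}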

static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
    common_chat_params data;
    // (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
    data.grammar_lazy = inputs.tool_choice != "required";
    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
        std::vector<std::string> tool_rules;
        foreach_function(inputs.tools, [&](const json & tool) {
            const auto & function = tool["function"];
            std::string name = function["name"];
            auto parameters = function["parameters"];
            builder.resolve_refs(parameters);
            tool_rules.push_back(builder.add_schema(name + "-call", {
                {"type", "object"},
                {"properties", json {
                    {"name", json {{"const", name}}},
                    {"arguments", parameters},
                }},
                {"required", json::array({"name", "arguments"})},
            }));
        });
        auto tool_call = "\"<tool_call>\" space " + builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " \"</tool_call>\" space";
        builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
        data.grammar_triggers.push_back({"<tool_call>", /* .at_start = */ false});
        data.preserved_tokens = { "</tool_call>" };
    }, grammar_options);

    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
    data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
    return data;
}

static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) {
    try {
        std::regex start_pattern(R"([\n\s]*<tool_call>)");
        std::regex middle_pattern(R"([\n\s]*</tool_call>[\n\s]*<tool_call>)");
        std::regex end_pattern(R"([\n\s]*</tool_call>[\n\s]*$)");

        auto end = input.end();
        std::sregex_iterator rend;
        std::sregex_iterator rit(input.begin(), end, start_pattern);
        if (rit == rend) {
            return {
                /* .role = */ "assistant",
                /* .content = */ input,
                /* .tool_calls = */ {},
            };
        }

        common_chat_msg result;
        result.role = "assistant";
        result.content = rit->prefix();

        auto it = rit->suffix().first;
        while (it != end) {
            json call;
            if (!parse_json(it, end, call)) {
                throw std::runtime_error("Failed to parse json tool call");
            }
            const auto & arguments = call["arguments"];
            result.tool_calls.push_back({
                call["name"],
                arguments.dump(),
                // arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
                /* id= */ "",
            });
            rit = {it, end, middle_pattern};
            if (rit != rend) {
                it = rit->suffix().first;
            } else {
                rit = {it, end, end_pattern};
                if (rit == rend) {
                    throw std::runtime_error("Malformed input, missing </tool_call>");
                }
                break;
            }
        }
        return result;
    } catch (const std::exception & e) {
        return {
            /* .role = */ "assistant",
            /* .content = */ input,
            /* .tool_calls = */ {},
        };
    }
}
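
// Illustrative sketch (assumed example values, not part of the original file): output in
// the Hermes 2 Pro style that common_chat_parse_hermes_2_pro() accepts.
//
//   Sure, let me check that.
//   <tool_call>
//   {"name": "get_time", "arguments": {"tz": "UTC"}}
//   </tool_call>
//
// The text before the first <tool_call> becomes the message content; each JSON object
// becomes one tool call, with its "arguments" re-serialized via dump().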

static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
    common_chat_params data;
    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
    data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    data.grammar_lazy = false;
    if (!inputs.json_schema.is_null()) {
        if (!inputs.grammar.empty()) {
            throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
        }
        data.grammar = json_schema_to_grammar(inputs.json_schema);
    } else {
        data.grammar = inputs.grammar;
    }
    return data;
}

common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
    auto has_tools = !inputs.tools.is_null() && inputs.tool_choice != "none";
    LOG_DBG("[%s] has_tools=%s\n", __func__, has_tools ? "true" : "false");

    if (has_tools && !inputs.grammar.empty()) {
        throw std::runtime_error("Cannot specify grammar with tools");
    }

    const auto & src = tmpl.source();
    if (src.find(">>>all") != std::string::npos) {
        // Functionary prepends "all\n" to plain content outputs, so we use the parser no matter when
        return common_chat_params_init_functionary_v3_2(tmpl, inputs);
    }
    if (src.find(" functools[") != std::string::npos) {
        // Firefunction v2 requires datetime and functions in the context, even w/o tools.
        return common_chat_params_init_firefunction_v2(tmpl, inputs);
    }

    if (!has_tools) {
        return common_chat_params_init_without_tools(tmpl, inputs);
    }

    if (src.find("<tool_call>") != std::string::npos) {
        return common_chat_params_init_hermes_2_pro(tmpl, inputs);
    }
    if (src.find("<|start_header_id|>") != std::string::npos
        && src.find("<function=") != std::string::npos) {
        return common_chat_params_init_functionary_v3_1_llama_3_1(tmpl, inputs);
    }
    if (src.find("<|start_header_id|>ipython<|end_header_id|>") != std::string::npos) {
        auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
        return common_chat_params_init_llama_3_1_tool_calls(tmpl, inputs, allow_python_tag_builtin_tools);
    }
    if (src.find("<|tool▁calls▁begin|>") != std::string::npos) {
        return common_chat_params_init_deepseek_r1(tmpl, inputs);
    }
    if (src.find("[TOOL_CALLS]") != std::string::npos) {
        return common_chat_params_init_mistral_nemo(tmpl, inputs);
    }
    if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos) {
        return common_chat_params_init_command_r7b(tmpl, inputs);
    }
    return common_chat_params_init_generic(tmpl, inputs);
}
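
// Minimal caller sketch (not part of the original file; assumes the caller already has a
// loaded common_chat_template `tmpl` and OpenAI-style JSON `messages` / `tools`):
//
//   common_chat_inputs inputs;
//   inputs.messages              = messages;  // json array of {role, content}
//   inputs.tools                 = tools;     // json array of {"type": "function", ...}
//   inputs.tool_choice           = "auto";
//   inputs.parallel_tool_calls   = false;
//   inputs.add_generation_prompt = true;
//
//   common_chat_params params = common_chat_params_init(tmpl, inputs);
//   // params.prompt           -> rendered prompt to feed to the model
//   // params.grammar          -> GBNF grammar constraining any tool-call output
//   // params.grammar_triggers -> lazy-grammar trigger words, when grammar_lazy is set
//   // params.format           -> pass to common_chat_parse() on the model's reply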

static common_chat_msg common_chat_parse_content_only(const std::string & input) {
    return {
        /* .role = */ "assistant",
        /* .content = */ input,
        /* .tool_calls = */ {},
    };
}

common_chat_msg common_chat_parse(const std::string & input, common_chat_format format) {
    switch (format) {
        case COMMON_CHAT_FORMAT_CONTENT_ONLY:
            return common_chat_parse_content_only(input);
        case COMMON_CHAT_FORMAT_GENERIC:
            return common_chat_parse_generic(input);
        case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
            return common_chat_parse_mistral_nemo(input);
        case COMMON_CHAT_FORMAT_LLAMA_3_X:
            return common_chat_parse_llama_3_1(input);
        case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS:
            return common_chat_parse_llama_3_1(input, /* with_builtin_tools= */ true);
        case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
            return common_chat_parse_deepseek_r1(input);
        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
            return common_chat_parse_functionary_v3_2(input);
        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
            return common_chat_parse_functionary_v3_1_llama_3_1(input);
        case COMMON_CHAT_FORMAT_HERMES_2_PRO:
            return common_chat_parse_hermes_2_pro(input);
        case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
            return common_chat_parse_firefunction_v2(input);
        case COMMON_CHAT_FORMAT_COMMAND_R7B:
            return common_chat_parse_command_r7b(input);
        default:
            throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
    }
}
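
// Usage sketch (not part of the original file): feeding the generated text back through
// the matching parser, using the format returned by common_chat_params_init().
//
//   common_chat_msg msg = common_chat_parse(generated_text, params.format);
//   for (const auto & tc : msg.tool_calls) {
//       // tc.name, tc.arguments (JSON-encoded string), tc.id
//   }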

@@ -1,52 +0,0 @@
// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.

#pragma once

#include "common.h"
#include <json.hpp>
#include <optional>
#include <string>
#include <vector>

using json = nlohmann::ordered_json;

struct common_chat_inputs {
    json messages;
    json tools;
    json tool_choice;
    json json_schema;
    bool parallel_tool_calls;
    bool stream;
    std::string grammar;
    bool add_generation_prompt = true;
};

enum common_chat_format {
    COMMON_CHAT_FORMAT_CONTENT_ONLY,
    COMMON_CHAT_FORMAT_GENERIC,
    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
    COMMON_CHAT_FORMAT_LLAMA_3_X,
    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
    COMMON_CHAT_FORMAT_HERMES_2_PRO,
    COMMON_CHAT_FORMAT_COMMAND_R7B,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};

struct common_chat_params {
    common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    json prompt;
    std::string grammar;
    bool grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_triggers;
    std::vector<std::string> preserved_tokens;
    std::vector<std::string> additional_stops;
};

struct common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & params);
std::string common_chat_format_name(common_chat_format format);
common_chat_msg common_chat_parse(const std::string & input, common_chat_format format);
2874 common/common.cpp (file diff suppressed because it is too large)
512 common/common.h
@@ -2,12 +2,20 @@
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "llama-cpp.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <set>
|
#include "sampling.h"
|
||||||
|
|
||||||
|
#define LOG_NO_FILE_LINE_FUNCTION
|
||||||
|
#include "log.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <sstream>
|
#include <random>
|
||||||
|
#include <thread>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <tuple>
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#define DIRECTORY_SEPARATOR '\\'
|
#define DIRECTORY_SEPARATOR '\\'
|
||||||
|
@@ -25,192 +33,52 @@
|
||||||
|
|
||||||
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
|
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
|
||||||
|
|
||||||
struct common_adapter_lora_info {
|
|
||||||
std::string path;
|
|
||||||
float scale;
|
|
||||||
|
|
||||||
struct llama_adapter_lora * ptr;
|
|
||||||
};
|
|
||||||
|
|
||||||
using llama_tokens = std::vector<llama_token>;
|
|
||||||
|
|
||||||
// build info
|
// build info
|
||||||
extern int LLAMA_BUILD_NUMBER;
|
extern int LLAMA_BUILD_NUMBER;
|
||||||
extern const char * LLAMA_COMMIT;
|
extern char const * LLAMA_COMMIT;
|
||||||
extern const char * LLAMA_COMPILER;
|
extern char const * LLAMA_COMPILER;
|
||||||
extern const char * LLAMA_BUILD_TARGET;
|
extern char const * LLAMA_BUILD_TARGET;
|
||||||
|
|
||||||
struct common_control_vector_load_info;
|
struct llama_control_vector_load_info;
|
||||||
|
|
||||||
//
|
//
|
||||||
// CPU utils
|
// CPU utils
|
||||||
//
|
//
|
||||||
|
|
||||||
struct cpu_params {
|
|
||||||
int n_threads = -1;
|
|
||||||
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
|
||||||
bool mask_valid = false; // Default: any CPU
|
|
||||||
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
|
||||||
bool strict_cpu = false; // Use strict CPU placement
|
|
||||||
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
|
||||||
};
|
|
||||||
|
|
||||||
int32_t cpu_get_num_physical_cores();
|
int32_t cpu_get_num_physical_cores();
|
||||||
int32_t cpu_get_num_math();
|
int32_t cpu_get_num_math();
|
||||||
|
|
||||||
//
|
//
|
||||||
// Common params
|
// CLI argument parsing
|
||||||
//
|
//
|
||||||
|
|
||||||
enum llama_example {
|
|
||||||
LLAMA_EXAMPLE_COMMON,
|
|
||||||
LLAMA_EXAMPLE_SPECULATIVE,
|
|
||||||
LLAMA_EXAMPLE_MAIN,
|
|
||||||
LLAMA_EXAMPLE_INFILL,
|
|
||||||
LLAMA_EXAMPLE_EMBEDDING,
|
|
||||||
LLAMA_EXAMPLE_PERPLEXITY,
|
|
||||||
LLAMA_EXAMPLE_RETRIEVAL,
|
|
||||||
LLAMA_EXAMPLE_PASSKEY,
|
|
||||||
LLAMA_EXAMPLE_IMATRIX,
|
|
||||||
LLAMA_EXAMPLE_BENCH,
|
|
||||||
LLAMA_EXAMPLE_SERVER,
|
|
||||||
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
|
|
||||||
LLAMA_EXAMPLE_EXPORT_LORA,
|
|
||||||
LLAMA_EXAMPLE_LLAVA,
|
|
||||||
LLAMA_EXAMPLE_LOOKUP,
|
|
||||||
LLAMA_EXAMPLE_PARALLEL,
|
|
||||||
LLAMA_EXAMPLE_TTS,
|
|
||||||
|
|
||||||
LLAMA_EXAMPLE_COUNT,
|
|
||||||
};
|
|
||||||
|
|
||||||
enum common_sampler_type {
|
|
||||||
COMMON_SAMPLER_TYPE_NONE = 0,
|
|
||||||
COMMON_SAMPLER_TYPE_DRY = 1,
|
|
||||||
COMMON_SAMPLER_TYPE_TOP_K = 2,
|
|
||||||
COMMON_SAMPLER_TYPE_TOP_P = 3,
|
|
||||||
COMMON_SAMPLER_TYPE_MIN_P = 4,
|
|
||||||
//COMMON_SAMPLER_TYPE_TFS_Z = 5,
|
|
||||||
COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
|
|
||||||
COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
|
|
||||||
COMMON_SAMPLER_TYPE_XTC = 8,
|
|
||||||
COMMON_SAMPLER_TYPE_INFILL = 9,
|
|
||||||
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
|
||||||
};
|
|
||||||
|
|
||||||
// dimensionality reduction methods, used by cvector-generator
|
// dimensionality reduction methods, used by cvector-generator
|
||||||
enum dimre_method {
|
enum dimre_method {
|
||||||
DIMRE_METHOD_PCA,
|
DIMRE_METHOD_PCA,
|
||||||
DIMRE_METHOD_MEAN,
|
DIMRE_METHOD_MEAN,
|
||||||
};
|
};
|
||||||
|
|
||||||
enum common_conversation_mode {
|
struct gpt_params {
|
||||||
COMMON_CONVERSATION_MODE_DISABLED = 0,
|
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
||||||
COMMON_CONVERSATION_MODE_ENABLED = 1,
|
|
||||||
COMMON_CONVERSATION_MODE_AUTO = 2,
|
|
||||||
};
|
|
||||||
|
|
||||||
struct common_grammar_trigger {
|
int32_t n_threads = cpu_get_num_math();
|
||||||
std::string word;
|
int32_t n_threads_draft = -1;
|
||||||
bool at_start;
|
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||||
};
|
int32_t n_threads_batch_draft = -1;
|
||||||
|
|
||||||
// sampling parameters
|
|
||||||
struct common_params_sampling {
|
|
||||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
|
||||||
|
|
||||||
int32_t n_prev = 64; // number of previous tokens to remember
|
|
||||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
|
||||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
|
||||||
int32_t top_k = 40; // <= 0 to use vocab size
|
|
||||||
float top_p = 0.95f; // 1.0 = disabled
|
|
||||||
float min_p = 0.05f; // 0.0 = disabled
|
|
||||||
float xtc_probability = 0.00f; // 0.0 = disabled
|
|
||||||
float xtc_threshold = 0.10f; // > 0.5 disables XTC
|
|
||||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
|
||||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
|
||||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
|
||||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
|
||||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
|
||||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
|
||||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
|
||||||
float penalty_present = 0.00f; // 0.0 = disabled
|
|
||||||
float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
|
|
||||||
float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
|
|
||||||
int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
|
|
||||||
int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
|
|
||||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
|
||||||
float mirostat_tau = 5.00f; // target entropy
|
|
||||||
float mirostat_eta = 0.10f; // learning rate
|
|
||||||
bool ignore_eos = false;
|
|
||||||
bool no_perf = false; // disable performance metrics
|
|
||||||
bool timing_per_token = false;
|
|
||||||
|
|
||||||
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
|
|
||||||
|
|
||||||
|
|
||||||
std::vector<enum common_sampler_type> samplers = {
|
|
||||||
COMMON_SAMPLER_TYPE_PENALTIES,
|
|
||||||
COMMON_SAMPLER_TYPE_DRY,
|
|
||||||
COMMON_SAMPLER_TYPE_TOP_K,
|
|
||||||
COMMON_SAMPLER_TYPE_TYPICAL_P,
|
|
||||||
COMMON_SAMPLER_TYPE_TOP_P,
|
|
||||||
COMMON_SAMPLER_TYPE_MIN_P,
|
|
||||||
COMMON_SAMPLER_TYPE_XTC,
|
|
||||||
COMMON_SAMPLER_TYPE_TEMPERATURE,
|
|
||||||
};
|
|
||||||
|
|
||||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
|
||||||
bool grammar_lazy = false;
|
|
||||||
std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
|
|
||||||
std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
|
|
||||||
std::set<llama_token> preserved_tokens;
|
|
||||||
|
|
||||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
|
||||||
|
|
||||||
// print the parameters into a string
|
|
||||||
std::string print() const;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct common_params_speculative {
|
|
||||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
|
||||||
|
|
||||||
int32_t n_ctx = 0; // draft context size
|
|
||||||
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
|
||||||
int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
|
|
||||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
|
||||||
float p_split = 0.1f; // speculative decoding split probability
|
|
||||||
float p_min = 0.9f; // minimum speculative decoding probability (greedy)
|
|
||||||
|
|
||||||
struct cpu_params cpuparams;
|
|
||||||
struct cpu_params cpuparams_batch;
|
|
||||||
|
|
||||||
std::string hf_repo = ""; // HF repo // NOLINT
|
|
||||||
std::string hf_file = ""; // HF file // NOLINT
|
|
||||||
|
|
||||||
std::string model = ""; // draft model for speculative decoding // NOLINT
|
|
||||||
std::string model_url = ""; // model url to download // NOLINT
|
|
||||||
};
|
|
||||||
|
|
||||||
struct common_params_vocoder {
|
|
||||||
std::string hf_repo = ""; // HF repo // NOLINT
|
|
||||||
std::string hf_file = ""; // HF file // NOLINT
|
|
||||||
|
|
||||||
std::string model = ""; // model path // NOLINT
|
|
||||||
std::string model_url = ""; // model url to download // NOLINT
|
|
||||||
|
|
||||||
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
|
|
||||||
};
|
|
||||||
|
|
||||||
struct common_params {
|
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
int32_t n_ctx = 4096; // context size
|
int32_t n_ctx = 0; // context size
|
||||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
|
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
|
||||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||||
int32_t n_sequences = 1; // number of sequences to decode
|
int32_t n_sequences = 1; // number of sequences to decode
|
||||||
|
float p_split = 0.1f; // speculative decoding split probability
|
||||||
|
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||||
|
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||||
|
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||||
|
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||||
int32_t grp_attn_n = 1; // group-attention factor
|
int32_t grp_attn_n = 1; // group-attention factor
|
||||||
int32_t grp_attn_w = 512; // group-attention width
|
int32_t grp_attn_w = 512; // group-attention width
|
||||||
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
|
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
|
||||||
|
@@ -221,56 +89,47 @@ struct common_params {
|
||||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||||
float defrag_thold = 0.1f; // KV cache defragmentation threshold
|
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||||
|
|
||||||
// offload params
|
|
||||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
|
||||||
|
|
||||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
|
||||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
|
||||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
|
||||||
|
|
||||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
|
||||||
|
|
||||||
struct cpu_params cpuparams;
|
|
||||||
struct cpu_params cpuparams_batch;
|
|
||||||
|
|
||||||
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||||
void * cb_eval_user_data = nullptr;
|
void * cb_eval_user_data = nullptr;
|
||||||
|
|
||||||
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||||
|
|
||||||
|
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||||
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||||
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||||
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
||||||
|
|
||||||
struct common_params_sampling sampling;
|
// // sampling parameters
|
||||||
struct common_params_speculative speculative;
|
struct llama_sampling_params sparams;
|
||||||
struct common_params_vocoder vocoder;
|
|
||||||
|
|
||||||
std::string model = ""; // model path // NOLINT
|
std::string model = ""; // model path
|
||||||
std::string model_alias = ""; // model alias // NOLINT
|
std::string model_draft = ""; // draft model for speculative decoding
|
||||||
std::string model_url = ""; // model url to download // NOLINT
|
std::string model_alias = "unknown"; // model alias
|
||||||
std::string hf_token = ""; // HF token // NOLINT
|
std::string model_url = ""; // model url to download
|
||||||
std::string hf_repo = ""; // HF repo // NOLINT
|
std::string hf_token = ""; // HF token
|
||||||
std::string hf_file = ""; // HF file // NOLINT
|
std::string hf_repo = ""; // HF repo
|
||||||
std::string prompt = ""; // NOLINT
|
std::string hf_file = ""; // HF file
|
||||||
std::string prompt_file = ""; // store the external prompt file name // NOLINT
|
std::string prompt = "";
|
||||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
|
std::string prompt_file = ""; // store the external prompt file name
|
||||||
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
|
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||||
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
|
std::string input_prefix = ""; // string to prefix user inputs with
|
||||||
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
|
std::string input_suffix = ""; // string to suffix user inputs with
|
||||||
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
std::string logdir = ""; // directory in which to save YAML log files
|
||||||
std::string logits_file = ""; // file for saving *all* logits // NOLINT
|
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
|
||||||
|
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
|
||||||
|
std::string logits_file = ""; // file for saving *all* logits
|
||||||
|
std::string rpc_servers = ""; // comma separated list of RPC servers
|
||||||
|
|
||||||
std::vector<std::string> in_files; // all input files
|
std::vector<std::string> in_files; // all input files
|
||||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||||
std::vector<llama_model_kv_override> kv_overrides;
|
std::vector<llama_model_kv_override> kv_overrides;
|
||||||
|
|
||||||
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
|
// TODO: avoid tuple, use struct
|
||||||
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
|
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
|
||||||
|
|
||||||
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
|
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
|
||||||
|
|
||||||
int32_t verbosity = 0;
|
int32_t verbosity = 0;
|
||||||
int32_t control_vector_layer_start = -1; // layer range for control vector
|
int32_t control_vector_layer_start = -1; // layer range for control vector
|
||||||
|
@@ -296,6 +155,7 @@ struct common_params {
|
||||||
bool special = false; // enable special token output
|
bool special = false; // enable special token output
|
||||||
bool interactive = false; // interactive mode
|
bool interactive = false; // interactive mode
|
||||||
bool interactive_first = false; // wait for user input immediately
|
bool interactive_first = false; // wait for user input immediately
|
||||||
|
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
|
||||||
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||||
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
||||||
|
|
||||||
|
@@ -304,58 +164,51 @@ struct common_params {
|
||||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||||
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
||||||
bool flash_attn = false; // flash attention
|
bool flash_attn = false; // flash attention
|
||||||
bool no_perf = false; // disable performance metrics
|
|
||||||
bool ctx_shift = true; // context shift on infinite text generation
|
|
||||||
|
|
||||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||||
|
bool ignore_eos = false; // ignore generated EOS tokens
|
||||||
bool logits_all = false; // return logits for all tokens in the batch
|
bool logits_all = false; // return logits for all tokens in the batch
|
||||||
bool use_mmap = true; // use mmap for faster loads
|
bool use_mmap = true; // use mmap for faster loads
|
||||||
bool use_mlock = false; // use mlock to keep model in memory
|
bool use_mlock = false; // use mlock to keep model in memory
|
||||||
bool verbose_prompt = false; // print prompt tokens before generation
|
bool verbose_prompt = false; // print prompt tokens before generation
|
||||||
bool display_prompt = true; // print prompt before generation
|
bool display_prompt = true; // print prompt before generation
|
||||||
|
bool infill = false; // use infill mode
|
||||||
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
||||||
bool no_kv_offload = false; // disable KV offloading
|
bool no_kv_offload = false; // disable KV offloading
|
||||||
bool warmup = true; // warmup run
|
bool warmup = true; // warmup run
|
||||||
bool check_tensors = false; // validate tensor data
|
bool check_tensors = false; // validate tensor data
|
||||||
|
|
||||||
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
std::string cache_type_k = "f16"; // KV cache data type for the K
|
||||||
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||||
|
|
||||||
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
|
|
||||||
|
|
||||||
// multimodal models (see examples/llava)
|
// multimodal models (see examples/llava)
|
||||||
std::string mmproj = ""; // path to multimodal projector // NOLINT
|
std::string mmproj = ""; // path to multimodal projector
|
||||||
std::vector<std::string> image; // path to image file(s)
|
std::vector<std::string> image; // path to image file(s)
|
||||||
|
|
||||||
// embedding
|
// embedding
|
||||||
bool embedding = false; // get only sentence embedding
|
bool embedding = false; // get only sentence embedding
|
||||||
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
||||||
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
||||||
std::string embd_sep = "\n"; // separator of embeddings
|
std::string embd_sep = "\n"; // separator of embeddings
|
||||||
bool reranking = false; // enable reranking support on server
|
|
||||||
|
|
||||||
// server params
|
// server params
|
||||||
int32_t port = 8080; // server listens on this network port
|
int32_t port = 8080; // server listens on this network port
|
||||||
int32_t timeout_read = 600; // http read timeout in seconds
|
int32_t timeout_read = 600; // http read timeout in seconds
|
||||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
int32_t n_threads_http = -1; // number of threads to process HTTP requests
|
||||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
|
||||||
|
|
||||||
std::string hostname = "127.0.0.1";
|
std::string hostname = "127.0.0.1";
|
||||||
std::string public_path = ""; // NOLINT
|
std::string public_path = "";
|
||||||
std::string chat_template = ""; // NOLINT
|
std::string chat_template = "";
|
||||||
bool use_jinja = false; // NOLINT
|
std::string system_prompt = "";
|
||||||
bool enable_chat_template = true;
|
bool enable_chat_template = true;
|
||||||
|
|
||||||
std::vector<std::string> api_keys;
|
std::vector<std::string> api_keys;
|
||||||
|
|
||||||
std::string ssl_file_key = ""; // NOLINT
|
std::string ssl_file_key = "";
|
||||||
std::string ssl_file_cert = ""; // NOLINT
|
std::string ssl_file_cert = "";
|
||||||
|
|
||||||
// "advanced" endpoints are disabled by default for better security
|
bool endpoint_slots = true;
|
||||||
bool webui = true;
|
|
||||||
bool endpoint_slots = false;
|
|
||||||
bool endpoint_props = false; // only control POST requests, not GET
|
|
||||||
bool endpoint_metrics = false;
|
bool endpoint_metrics = false;
|
||||||
|
|
||||||
bool log_json = false;
|
bool log_json = false;
|
||||||
|
@@ -403,51 +256,29 @@ struct common_params {
|
||||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||||
|
|
||||||
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
||||||
|
|
||||||
// batched-bench params
|
|
||||||
bool batched_bench_output_jsonl = false;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// call once at the start of a program if it uses libcommon
|
void gpt_params_handle_hf_token(gpt_params & params);
|
||||||
// initializes the logging system and prints info about the build
|
void gpt_params_handle_model_default(gpt_params & params);
|
||||||
void common_init();
|
|
||||||
|
|
||||||
std::string common_params_get_system_info(const common_params & params);
|
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
|
||||||
|
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
|
||||||
|
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
|
||||||
|
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
|
||||||
|
|
||||||
bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
std::string gpt_params_get_system_info(const gpt_params & params);
|
||||||
bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
|
||||||
void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
|
|
||||||
bool set_process_priority(enum ggml_sched_priority prio);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// String utils
|
// String utils
|
||||||
//
|
//
|
||||||
|
|
||||||
#ifdef __GNUC__
|
std::vector<std::string> string_split(std::string input, char separator);
|
||||||
#ifdef __MINGW32__
|
|
||||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
|
||||||
#else
|
|
||||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
|
||||||
std::string string_format(const char * fmt, ...);
|
|
||||||
|
|
||||||
std::string string_strip(const std::string & str);
|
std::string string_strip(const std::string & str);
|
||||||
std::string string_get_sortable_timestamp();
|
std::string string_get_sortable_timestamp();
|
||||||
|
|
||||||
std::string string_join(const std::vector<std::string> & values, const std::string & separator);
|
|
||||||
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
|
|
||||||
std::string string_repeat(const std::string & str, size_t n);
|
|
||||||
|
|
||||||
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
|
|
||||||
|
|
||||||
template<class T>
|
template<class T>
|
||||||
static std::vector<T> string_split(const std::string & str, char delim) {
|
static std::vector<T> string_split(const std::string & str, char delim) {
|
||||||
static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
|
|
||||||
std::vector<T> values;
|
std::vector<T> values;
|
||||||
std::istringstream str_stream(str);
|
std::istringstream str_stream(str);
|
||||||
std::string token;
|
std::string token;
|
||||||
|
@@ -460,40 +291,9 @@ static std::vector<T> string_split(const std::string & str, char delim) {
|
||||||
return values;
|
return values;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
|
||||||
std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
|
|
||||||
{
|
|
||||||
std::vector<std::string> parts;
|
|
||||||
size_t begin_pos = 0;
|
|
||||||
size_t separator_pos = input.find(separator);
|
|
||||||
while (separator_pos != std::string::npos) {
|
|
||||||
std::string part = input.substr(begin_pos, separator_pos - begin_pos);
|
|
||||||
parts.emplace_back(part);
|
|
||||||
begin_pos = separator_pos + 1;
|
|
||||||
separator_pos = input.find(separator, begin_pos);
|
|
||||||
}
|
|
||||||
parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
|
|
||||||
return parts;
|
|
||||||
}
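
// Usage sketch (not part of the original diff): the specialization above keeps empty
// fields, while the generic template is meant to parse each field into T.
//
//   auto parts = string_split<std::string>("a,b,,c", ','); // {"a", "b", "", "c"}
//   auto nums  = string_split<int>("1 2 3", ' ');          // {1, 2, 3}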
|
|
||||||
|
|
||||||
static bool string_starts_with(const std::string & str,
|
|
||||||
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
|
|
||||||
return str.rfind(prefix, 0) == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool string_ends_with(const std::string & str,
|
|
||||||
const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
|
|
||||||
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||||
void string_process_escapes(std::string & input);
|
void string_process_escapes(std::string & input);
|
||||||
|
|
||||||
std::string string_from(bool value);
|
|
||||||
std::string string_from(const std::vector<int> & values);
|
|
||||||
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
|
|
||||||
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Filesystem utils
|
// Filesystem utils
|
||||||
//
|
//
|
||||||
|
@@ -508,193 +308,125 @@ std::string fs_get_cache_file(const std::string & filename);
|
||||||
// Model utils
|
// Model utils
|
||||||
//
|
//
|
||||||
|
|
||||||
// note: defines object's lifetime
|
// TODO: avoid tuple, use struct
|
||||||
struct common_init_result {
|
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
|
||||||
llama_model_ptr model;
|
|
||||||
llama_context_ptr context;
|
|
||||||
|
|
||||||
std::vector<llama_adapter_lora_ptr> lora;
|
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
|
||||||
};
|
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
|
||||||
|
|
||||||
struct common_init_result common_init_from_params(common_params & params);
|
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||||
|
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||||
|
|
||||||
struct llama_model_params common_model_params_to_llama ( common_params & params);
|
|
||||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
|
||||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
|
||||||
|
|
||||||
struct llama_model * common_load_model_from_url(
|
|
||||||
const std::string & model_url,
|
|
||||||
const std::string & local_path,
|
|
||||||
const std::string & hf_token,
|
|
||||||
const struct llama_model_params & params);
|
|
||||||
|
|
||||||
struct llama_model * common_load_model_from_hf(
|
|
||||||
const std::string & repo,
|
|
||||||
const std::string & remote_path,
|
|
||||||
const std::string & local_path,
|
|
||||||
const std::string & hf_token,
|
|
||||||
const struct llama_model_params & params);
|
|
||||||
|
|
||||||
std::pair<std::string, std::string> common_get_hf_file(
|
|
||||||
const std::string & hf_repo_with_tag,
|
|
||||||
const std::string & hf_token);
|
|
||||||
|
|
||||||
// clear LoRA adapters from context, then apply new list of adapters
|
|
||||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
|
|
||||||
|
|
||||||
//
|
|
||||||
// Batch utils
|
// Batch utils
|
||||||
//
|
|
||||||
|
|
||||||
void common_batch_clear(struct llama_batch & batch);
|
void llama_batch_clear(struct llama_batch & batch);
|
||||||
|
|
||||||
void common_batch_add(
|
void llama_batch_add(
|
||||||
struct llama_batch & batch,
|
struct llama_batch & batch,
|
||||||
llama_token id,
|
llama_token id,
|
||||||
llama_pos pos,
|
llama_pos pos,
|
||||||
const std::vector<llama_seq_id> & seq_ids,
|
const std::vector<llama_seq_id> & seq_ids,
|
||||||
bool logits);
|
bool logits);
|
||||||
|
|
||||||
//
|
|
||||||
// Token utils
|
|
||||||
//
|
|
||||||
|
|
||||||
// longest common prefix
|
|
||||||
size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
|
|
||||||
|
|
||||||
// longest common subsequence
|
|
||||||
size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Vocab utils
|
// Vocab utils
|
||||||
//
|
//
|
||||||
|
|
||||||
// tokenizes a string into a vector of tokens
|
// tokenizes a string into a vector of tokens
|
||||||
// should work similar to Python's `tokenizer.encode`
|
// should work similar to Python's `tokenizer.encode`
|
||||||
std::vector<llama_token> common_tokenize(
|
std::vector<llama_token> llama_tokenize(
|
||||||
const struct llama_context * ctx,
|
const struct llama_context * ctx,
|
||||||
const std::string & text,
|
const std::string & text,
|
||||||
bool add_special,
|
bool add_special,
|
||||||
bool parse_special = false);
|
bool parse_special = false);
|
||||||
|
|
||||||
std::vector<llama_token> common_tokenize(
|
std::vector<llama_token> llama_tokenize(
|
||||||
const struct llama_vocab * vocab,
|
const struct llama_model * model,
|
||||||
const std::string & text,
|
const std::string & text,
|
||||||
bool add_special,
|
bool add_special,
|
||||||
bool parse_special = false);
|
bool parse_special = false);
|
||||||
|
|
||||||
// tokenizes a token into a piece, optionally renders special/control tokens
|
// tokenizes a token into a piece, optionally renders special/control tokens
|
||||||
// should work similar to Python's `tokenizer.id_to_piece`
|
// should work similar to Python's `tokenizer.id_to_piece`
|
||||||
std::string common_token_to_piece(
|
std::string llama_token_to_piece(
|
||||||
const struct llama_context * ctx,
|
const struct llama_context * ctx,
|
||||||
llama_token token,
|
llama_token token,
|
||||||
bool special = true);
|
bool special = true);
|
||||||
|
|
||||||
std::string common_token_to_piece(
|
|
||||||
const struct llama_vocab * vocab,
|
|
||||||
llama_token token,
|
|
||||||
bool special = true);
|
|
||||||
|
|
||||||
// detokenizes a vector of tokens into a string
|
// detokenizes a vector of tokens into a string
|
||||||
// should work similar to Python's `tokenizer.decode`
|
// should work similar to Python's `tokenizer.decode`
|
||||||
// optionally renders special/control tokens
|
// optionally renders special/control tokens
|
||||||
std::string common_detokenize(
|
std::string llama_detokenize(
|
||||||
const struct llama_context * ctx,
|
llama_context * ctx,
|
||||||
const std::vector<llama_token> & tokens,
|
const std::vector<llama_token> & tokens,
|
||||||
bool special = true);
|
bool special = true);
|
||||||
|
|
||||||
std::string common_detokenize(
|
// Uses the value from the model metadata if possible, otherwise
|
||||||
const struct llama_vocab * vocab,
|
// defaults to true when model type is SPM, otherwise false.
|
||||||
const std::vector<llama_token> & tokens,
|
bool llama_should_add_bos_token(const llama_model * model);
|
||||||
bool special = true);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Chat template utils
|
// Chat template utils
|
||||||
//
|
//
|
||||||
|
|
||||||
struct common_tool_call {
|
|
||||||
std::string name;
|
|
||||||
std::string arguments;
|
|
||||||
std::string id;
|
|
||||||
};
|
|
||||||
|
|
||||||
// same with llama_chat_message, but uses std::string
|
// same with llama_chat_message, but uses std::string
|
||||||
struct common_chat_msg {
|
struct llama_chat_msg {
|
||||||
std::string role;
|
std::string role;
|
||||||
std::string content;
|
std::string content;
|
||||||
std::vector<common_tool_call> tool_calls;
|
|
||||||
std::string tool_plan = "";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||||
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
|
bool llama_chat_verify_template(const std::string & tmpl);
|
||||||
|
|
||||||
namespace minja {
|
|
||||||
class chat_template;
|
|
||||||
}
|
|
||||||
|
|
||||||
typedef minja::chat_template common_chat_template;
|
|
||||||
|
|
||||||
struct common_chat_templates {
|
|
||||||
bool has_explicit_template; // Model had builtin template or template override was specified.
|
|
||||||
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
|
|
||||||
std::unique_ptr<common_chat_template> template_tool_use;
|
|
||||||
};
|
|
||||||
|
|
||||||
// CPP wrapper for llama_chat_apply_template
|
// CPP wrapper for llama_chat_apply_template
|
||||||
// If the built-in template is not supported, we default to chatml
|
// If the built-in template is not supported, we default to chatml
|
||||||
// If the custom "tmpl" is not supported, we throw an error
|
// If the custom "tmpl" is not supported, we throw an error
|
||||||
std::string common_chat_apply_template(
|
std::string llama_chat_apply_template(const struct llama_model * model,
|
||||||
const common_chat_template & tmpl,
|
const std::string & tmpl,
|
||||||
const std::vector<common_chat_msg> & chat,
|
const std::vector<llama_chat_msg> & chat,
|
||||||
bool add_ass,
|
bool add_ass);
|
||||||
bool use_jinja);
|
|
||||||
|
|
||||||
// Format single message, while taking into account the position of that message in chat history
|
// Format single message, while taking into account the position of that message in chat history
|
||||||
std::string common_chat_format_single(
|
std::string llama_chat_format_single(const struct llama_model * model,
|
||||||
const common_chat_template & tmpl,
|
const std::string & tmpl,
|
||||||
const std::vector<common_chat_msg> & past_msg,
|
const std::vector<llama_chat_msg> & past_msg,
|
||||||
const common_chat_msg & new_msg,
|
const llama_chat_msg & new_msg,
|
||||||
bool add_ass,
|
bool add_ass);
|
||||||
bool use_jinja);
|
|
||||||
|
|
||||||
// Returns an example of formatted chat
|
// Returns an example of formatted chat
|
||||||
std::string common_chat_format_example(
|
std::string llama_chat_format_example(const struct llama_model * model,
|
||||||
const common_chat_template & tmpl, bool use_jinja);
|
const std::string & tmpl);
|
||||||
|
|
||||||
common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);
|
|
||||||
|
|
||||||
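As a usage sketch for the chat helpers above (illustrative only; assumes a loaded llama_model * model): build the history as llama_chat_msg entries, render it with the model's built-in template, then format one additional turn relative to that history.

    static std::string build_prompt(const llama_model * model) {
        std::vector<llama_chat_msg> chat = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };
        // render the existing history; empty tmpl selects the model's built-in template (falls back to chatml)
        std::string prompt = llama_chat_apply_template(model, "", chat, /*add_ass=*/false);

        // format a single new message, taking the existing history into account, ending with the assistant prefix
        llama_chat_msg next = { "user", "How are you?" };
        prompt += llama_chat_format_single(model, "", chat, next, /*add_ass=*/true);
        return prompt;
    }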
 //
 // KV cache utils
 //
 
 // Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
 
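A sketch of how the dump helpers are typically driven (illustrative; the llama_kv_cache_view_init/update/free calls are assumed to come from llama.h and are not part of this header):

    static void dump_cache(llama_context * ctx) {
        llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max=*/1); // assumed llama.h API
        llama_kv_cache_view_update(ctx, &view);
        llama_kv_cache_dump_view(view);          // compact: sequence count per cell
        llama_kv_cache_dump_view_seqs(view, 40); // verbose: individual sequence ids
        llama_kv_cache_view_free(&view);
    }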
 //
 // Embedding utils
 //
 
-// TODO: repace embd_norm with an enum
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
 
-float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
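The two embedding helpers above are usually combined as follows: normalize two embedding vectors, then compare them with the cosine helper. A small sketch (the buffers stand in for real model output):

    static float compare_embeddings(const std::vector<float> & a, const std::vector<float> & b) {
        const int n = (int) a.size();
        std::vector<float> an(n), bn(n);
        llama_embd_normalize(a.data(), an.data(), n, /*embd_norm=*/2); // 2 = Euclidean norm
        llama_embd_normalize(b.data(), bn.data(), n, /*embd_norm=*/2);
        return llama_embd_similarity_cos(an.data(), bn.data(), n);    // result in [-1, 1]
    }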
 //
 // Control vector utils
 //
 
-struct common_control_vector_data {
+struct llama_control_vector_data {
     int n_embd;
 
     // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
     std::vector<float> data;
 };
 
-struct common_control_vector_load_info {
+struct llama_control_vector_load_info {
     float strength;
 
     std::string fname;

@@ -702,16 +434,24 @@ struct common_control_vector_load_info {

 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
-common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
+llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
 
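A short sketch of loading and mixing control vectors with the declarations above (the file names are placeholders, not from the diff; the "+"-side names are used):

    static llama_control_vector_data load_steering() {
        std::vector<llama_control_vector_load_info> infos = {
            {  0.8f, "happy.gguf"  },   // scaled by +0.8
            { -0.4f, "sleepy.gguf" },   // subtracted at 0.4 strength
        };
        llama_control_vector_data cvec = llama_control_vector_load(infos);
        if (cvec.n_embd == -1) {
            // loading failed: cvec.data is empty
        }
        return cvec;
    }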
 //
 // Split utils
 //
 
-namespace {
-
-const char * const LLM_KV_SPLIT_NO            = "split.no";
-const char * const LLM_KV_SPLIT_COUNT         = "split.count";
-const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-}
+static const char * const LLM_KV_SPLIT_NO            = "split.no";
+static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+//
+// YAML utils
+//
+
+void yaml_dump_vector_float    (FILE * stream, const char * prop_name, const std::vector<float> & data);
+void yaml_dump_vector_int      (FILE * stream, const char * prop_name, const std::vector<int> & data);
+void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
+
+void yaml_dump_non_result_info(
+    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
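Usage sketch for the yaml_dump_* helpers added in the hunk above (illustrative; the values are dummies):

    static void dump_run_info(FILE * f) {
        std::vector<float> logits = { 0.1f, 0.7f, 0.2f };
        std::vector<int>   tokens = { 1, 15043, 2 };
        yaml_dump_vector_float(f, "logits", logits);
        yaml_dump_vector_int(f, "prompt_tokens", tokens);
        yaml_dump_string_multiline(f, "prompt", "Hello\nworld");
    }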
@@ -94,9 +94,6 @@ namespace console {
             simple_io = true;
         }
     }
-    if (simple_io) {
-        _setmode(_fileno(stdin), _O_U8TEXT);
-    }
 #else
     // POSIX-specific console initialization
     if (!simple_io) {
common/grammar-parser.cpp (new file, 536 lines)
@@ -0,0 +1,536 @@
||||||
|
#include "grammar-parser.h"
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cwchar>
|
||||||
|
#include <string>
|
||||||
|
#include <utility>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <exception>
|
||||||
|
|
||||||
|
namespace grammar_parser {
|
||||||
|
// NOTE: assumes valid utf8 (but checks for overrun)
|
||||||
|
// copied from llama.cpp
|
||||||
|
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
|
||||||
|
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||||
|
uint8_t first_byte = static_cast<uint8_t>(*src);
|
||||||
|
uint8_t highbits = first_byte >> 4;
|
||||||
|
int len = lookup[highbits];
|
||||||
|
uint8_t mask = (1 << (8 - len)) - 1;
|
||||||
|
uint32_t value = first_byte & mask;
|
||||||
|
const char * end = src + len; // may overrun!
|
||||||
|
const char * pos = src + 1;
|
||||||
|
for ( ; pos < end && *pos; pos++) {
|
||||||
|
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
|
||||||
|
}
|
||||||
|
return std::make_pair(value, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
||||||
|
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||||
|
auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
|
||||||
|
return result.first->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
|
||||||
|
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||||
|
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
|
||||||
|
return next_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void add_rule(
|
||||||
|
parse_state & state,
|
||||||
|
uint32_t rule_id,
|
||||||
|
const std::vector<llama_grammar_element> & rule) {
|
||||||
|
if (state.rules.size() <= rule_id) {
|
||||||
|
state.rules.resize(rule_id + 1);
|
||||||
|
}
|
||||||
|
state.rules[rule_id] = rule;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool is_digit_char(char c) {
|
||||||
|
return '0' <= c && c <= '9';
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool is_word_char(char c) {
|
||||||
|
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
|
||||||
|
const char * pos = src;
|
||||||
|
const char * end = src + size;
|
||||||
|
uint32_t value = 0;
|
||||||
|
for ( ; pos < end && *pos; pos++) {
|
||||||
|
value <<= 4;
|
||||||
|
char c = *pos;
|
||||||
|
if ('a' <= c && c <= 'f') {
|
||||||
|
value += c - 'a' + 10;
|
||||||
|
} else if ('A' <= c && c <= 'F') {
|
||||||
|
value += c - 'A' + 10;
|
||||||
|
} else if ('0' <= c && c <= '9') {
|
||||||
|
value += c - '0';
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (pos != end) {
|
||||||
|
throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
|
||||||
|
}
|
||||||
|
return std::make_pair(value, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char * parse_space(const char * src, bool newline_ok) {
|
||||||
|
const char * pos = src;
|
||||||
|
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
|
||||||
|
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
|
||||||
|
if (*pos == '#') {
|
||||||
|
while (*pos && *pos != '\r' && *pos != '\n') {
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char * parse_name(const char * src) {
|
||||||
|
const char * pos = src;
|
||||||
|
while (is_word_char(*pos)) {
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
if (pos == src) {
|
||||||
|
throw std::runtime_error(std::string("expecting name at ") + src);
|
||||||
|
}
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char * parse_int(const char * src) {
|
||||||
|
const char * pos = src;
|
||||||
|
while (is_digit_char(*pos)) {
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
if (pos == src) {
|
||||||
|
throw std::runtime_error(std::string("expecting integer at ") + src);
|
||||||
|
}
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::pair<uint32_t, const char *> parse_char(const char * src) {
|
||||||
|
if (*src == '\\') {
|
||||||
|
switch (src[1]) {
|
||||||
|
case 'x': return parse_hex(src + 2, 2);
|
||||||
|
case 'u': return parse_hex(src + 2, 4);
|
||||||
|
case 'U': return parse_hex(src + 2, 8);
|
||||||
|
case 't': return std::make_pair('\t', src + 2);
|
||||||
|
case 'r': return std::make_pair('\r', src + 2);
|
||||||
|
case 'n': return std::make_pair('\n', src + 2);
|
||||||
|
case '\\':
|
||||||
|
case '"':
|
||||||
|
case '[':
|
||||||
|
case ']':
|
||||||
|
return std::make_pair(src[1], src + 2);
|
||||||
|
default:
|
||||||
|
throw std::runtime_error(std::string("unknown escape at ") + src);
|
||||||
|
}
|
||||||
|
} else if (*src) {
|
||||||
|
return decode_utf8(src);
|
||||||
|
}
|
||||||
|
throw std::runtime_error("unexpected end of input");
|
||||||
|
}
|
||||||
|
|
||||||
|
const char * parse_alternates(
|
||||||
|
parse_state & state,
|
||||||
|
const char * src,
|
||||||
|
const std::string & rule_name,
|
||||||
|
uint32_t rule_id,
|
||||||
|
bool is_nested);
|
||||||
|
|
||||||
|
static const char * parse_sequence(
|
||||||
|
parse_state & state,
|
||||||
|
const char * src,
|
||||||
|
const std::string & rule_name,
|
||||||
|
std::vector<llama_grammar_element> & out_elements,
|
||||||
|
bool is_nested) {
|
||||||
|
size_t last_sym_start = out_elements.size();
|
||||||
|
const char * pos = src;
|
||||||
|
|
||||||
|
auto handle_repetitions = [&](int min_times, int max_times) {
|
||||||
|
|
||||||
|
if (last_sym_start == out_elements.size()) {
|
||||||
|
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
// apply transformation to previous symbol (last_sym_start to end) according to
|
||||||
|
// the following rewrite rules:
|
||||||
|
// S{m,n} --> S S S (m times) S'(n-m)
|
||||||
|
// S'(x) ::= S S'(x-1) |
|
||||||
|
// (... n-m definitions of these S' rules ...)
|
||||||
|
// S'(1) ::= S |
|
||||||
|
// S{m,} --> S S S (m times) S'
|
||||||
|
// S' ::= S S' |
|
||||||
|
// S* --> S{0,}
|
||||||
|
// --> S' ::= S S' |
|
||||||
|
// S+ --> S{1,}
|
||||||
|
// --> S S'
|
||||||
|
// S' ::= S S' |
|
||||||
|
// S? --> S{0,1}
|
||||||
|
// --> S'
|
||||||
|
// S' ::= S |
|
||||||
|
|
||||||
|
std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
|
||||||
|
if (min_times == 0) {
|
||||||
|
out_elements.resize(last_sym_start);
|
||||||
|
} else {
|
||||||
|
// Repeat the previous elements (min_times - 1) times
|
||||||
|
for (int i = 1; i < min_times; i++) {
|
||||||
|
out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t last_rec_rule_id = 0;
|
||||||
|
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
|
||||||
|
|
||||||
|
std::vector<llama_grammar_element> rec_rule(previous_elements);
|
||||||
|
for (int i = 0; i < n_opt; i++) {
|
||||||
|
rec_rule.resize(previous_elements.size());
|
||||||
|
uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
|
||||||
|
if (i > 0 || max_times < 0) {
|
||||||
|
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
|
||||||
|
}
|
||||||
|
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
|
||||||
|
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
|
||||||
|
add_rule(state, rec_rule_id, rec_rule);
|
||||||
|
last_rec_rule_id = rec_rule_id;
|
||||||
|
}
|
||||||
|
if (n_opt > 0) {
|
||||||
|
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
while (*pos) {
|
||||||
|
if (*pos == '"') { // literal string
|
||||||
|
pos++;
|
||||||
|
last_sym_start = out_elements.size();
|
||||||
|
while (*pos != '"') {
|
||||||
|
if (!*pos) {
|
||||||
|
throw std::runtime_error("unexpected end of input");
|
||||||
|
}
|
||||||
|
auto char_pair = parse_char(pos);
|
||||||
|
pos = char_pair.second;
|
||||||
|
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
|
||||||
|
}
|
||||||
|
pos = parse_space(pos + 1, is_nested);
|
||||||
|
} else if (*pos == '[') { // char range(s)
|
||||||
|
pos++;
|
||||||
|
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
|
||||||
|
if (*pos == '^') {
|
||||||
|
pos++;
|
||||||
|
start_type = LLAMA_GRETYPE_CHAR_NOT;
|
||||||
|
}
|
||||||
|
last_sym_start = out_elements.size();
|
||||||
|
while (*pos != ']') {
|
||||||
|
if (!*pos) {
|
||||||
|
throw std::runtime_error("unexpected end of input");
|
||||||
|
}
|
||||||
|
auto char_pair = parse_char(pos);
|
||||||
|
pos = char_pair.second;
|
||||||
|
enum llama_gretype type = last_sym_start < out_elements.size()
|
||||||
|
? LLAMA_GRETYPE_CHAR_ALT
|
||||||
|
: start_type;
|
||||||
|
|
||||||
|
out_elements.push_back({type, char_pair.first});
|
||||||
|
if (pos[0] == '-' && pos[1] != ']') {
|
||||||
|
if (!pos[1]) {
|
||||||
|
throw std::runtime_error("unexpected end of input");
|
||||||
|
}
|
||||||
|
auto endchar_pair = parse_char(pos + 1);
|
||||||
|
pos = endchar_pair.second;
|
||||||
|
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pos = parse_space(pos + 1, is_nested);
|
||||||
|
} else if (is_word_char(*pos)) { // rule reference
|
||||||
|
const char * name_end = parse_name(pos);
|
||||||
|
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
|
||||||
|
pos = parse_space(name_end, is_nested);
|
||||||
|
last_sym_start = out_elements.size();
|
||||||
|
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
|
||||||
|
} else if (*pos == '(') { // grouping
|
||||||
|
// parse nested alternates into synthesized rule
|
||||||
|
pos = parse_space(pos + 1, true);
|
||||||
|
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
|
||||||
|
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
|
||||||
|
last_sym_start = out_elements.size();
|
||||||
|
// output reference to synthesized rule
|
||||||
|
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
|
||||||
|
if (*pos != ')') {
|
||||||
|
throw std::runtime_error(std::string("expecting ')' at ") + pos);
|
||||||
|
}
|
||||||
|
pos = parse_space(pos + 1, is_nested);
|
||||||
|
} else if (*pos == '.') { // any char
|
||||||
|
last_sym_start = out_elements.size();
|
||||||
|
out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
|
||||||
|
pos = parse_space(pos + 1, is_nested);
|
||||||
|
} else if (*pos == '*') {
|
||||||
|
pos = parse_space(pos + 1, is_nested);
|
||||||
|
handle_repetitions(0, -1);
|
||||||
|
} else if (*pos == '+') {
|
||||||
|
pos = parse_space(pos + 1, is_nested);
|
||||||
|
handle_repetitions(1, -1);
|
||||||
|
} else if (*pos == '?') {
|
||||||
|
pos = parse_space(pos + 1, is_nested);
|
||||||
|
handle_repetitions(0, 1);
|
||||||
|
} else if (*pos == '{') {
|
||||||
|
pos = parse_space(pos + 1, is_nested);
|
||||||
|
|
||||||
|
if (!is_digit_char(*pos)) {
|
||||||
|
throw std::runtime_error(std::string("expecting an int at ") + pos);
|
||||||
|
}
|
||||||
|
const char * int_end = parse_int(pos);
|
||||||
|
int min_times = std::stoul(std::string(pos, int_end - pos));
|
||||||
|
pos = parse_space(int_end, is_nested);
|
||||||
|
|
||||||
|
int max_times = -1;
|
||||||
|
|
||||||
|
if (*pos == '}') {
|
||||||
|
max_times = min_times;
|
||||||
|
pos = parse_space(pos + 1, is_nested);
|
||||||
|
} else if (*pos == ',') {
|
||||||
|
pos = parse_space(pos + 1, is_nested);
|
||||||
|
|
||||||
|
if (is_digit_char(*pos)) {
|
||||||
|
const char * int_end = parse_int(pos);
|
||||||
|
max_times = std::stoul(std::string(pos, int_end - pos));
|
||||||
|
pos = parse_space(int_end, is_nested);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (*pos != '}') {
|
||||||
|
throw std::runtime_error(std::string("expecting '}' at ") + pos);
|
||||||
|
}
|
||||||
|
pos = parse_space(pos + 1, is_nested);
|
||||||
|
} else {
|
||||||
|
throw std::runtime_error(std::string("expecting ',' at ") + pos);
|
||||||
|
}
|
||||||
|
handle_repetitions(min_times, max_times);
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char * parse_alternates(
|
||||||
|
parse_state & state,
|
||||||
|
const char * src,
|
||||||
|
const std::string & rule_name,
|
||||||
|
uint32_t rule_id,
|
||||||
|
bool is_nested) {
|
||||||
|
std::vector<llama_grammar_element> rule;
|
||||||
|
const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
|
||||||
|
while (*pos == '|') {
|
||||||
|
rule.push_back({LLAMA_GRETYPE_ALT, 0});
|
||||||
|
pos = parse_space(pos + 1, true);
|
||||||
|
pos = parse_sequence(state, pos, rule_name, rule, is_nested);
|
||||||
|
}
|
||||||
|
rule.push_back({LLAMA_GRETYPE_END, 0});
|
||||||
|
add_rule(state, rule_id, rule);
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char * parse_rule(parse_state & state, const char * src) {
|
||||||
|
const char * name_end = parse_name(src);
|
||||||
|
const char * pos = parse_space(name_end, false);
|
||||||
|
size_t name_len = name_end - src;
|
||||||
|
uint32_t rule_id = get_symbol_id(state, src, name_len);
|
||||||
|
const std::string name(src, name_len);
|
||||||
|
|
||||||
|
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
|
||||||
|
throw std::runtime_error(std::string("expecting ::= at ") + pos);
|
||||||
|
}
|
||||||
|
pos = parse_space(pos + 3, true);
|
||||||
|
|
||||||
|
pos = parse_alternates(state, pos, name, rule_id, false);
|
||||||
|
|
||||||
|
if (*pos == '\r') {
|
||||||
|
pos += pos[1] == '\n' ? 2 : 1;
|
||||||
|
} else if (*pos == '\n') {
|
||||||
|
pos++;
|
||||||
|
} else if (*pos) {
|
||||||
|
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
|
||||||
|
}
|
||||||
|
return parse_space(pos, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
parse_state parse(const char * src) {
|
||||||
|
try {
|
||||||
|
parse_state state;
|
||||||
|
const char * pos = parse_space(src, true);
|
||||||
|
while (*pos) {
|
||||||
|
pos = parse_rule(state, pos);
|
||||||
|
}
|
||||||
|
// Validate the state to ensure that all rules are defined
|
||||||
|
for (const auto & rule : state.rules) {
|
||||||
|
for (const auto & elem : rule) {
|
||||||
|
if (elem.type == LLAMA_GRETYPE_RULE_REF) {
|
||||||
|
// Ensure that the rule at that location exists
|
||||||
|
if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
|
||||||
|
// Get the name of the rule that is missing
|
||||||
|
for (const auto & kv : state.symbol_ids) {
|
||||||
|
if (kv.second == elem.value) {
|
||||||
|
throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return state;
|
||||||
|
} catch (const std::exception & err) {
|
||||||
|
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
|
||||||
|
return parse_state();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_grammar_char(FILE * file, uint32_t c) {
|
||||||
|
if (0x20 <= c && c <= 0x7f) {
|
||||||
|
fprintf(file, "%c", static_cast<char>(c));
|
||||||
|
} else {
|
||||||
|
// cop out of encoding UTF-8
|
||||||
|
fprintf(file, "<U+%04X>", c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool is_char_element(llama_grammar_element elem) {
|
||||||
|
switch (elem.type) {
|
||||||
|
case LLAMA_GRETYPE_CHAR: return true;
|
||||||
|
case LLAMA_GRETYPE_CHAR_NOT: return true;
|
||||||
|
case LLAMA_GRETYPE_CHAR_ALT: return true;
|
||||||
|
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
|
||||||
|
case LLAMA_GRETYPE_CHAR_ANY: return true;
|
||||||
|
default: return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
|
||||||
|
for (auto elem : rule) {
|
||||||
|
switch (elem.type) {
|
||||||
|
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
|
||||||
|
case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
|
||||||
|
case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
|
||||||
|
case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
|
||||||
|
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
|
||||||
|
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
|
||||||
|
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
|
||||||
|
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
|
||||||
|
}
|
||||||
|
switch (elem.type) {
|
||||||
|
case LLAMA_GRETYPE_END:
|
||||||
|
case LLAMA_GRETYPE_ALT:
|
||||||
|
case LLAMA_GRETYPE_RULE_REF:
|
||||||
|
fprintf(file, "(%u) ", elem.value);
|
||||||
|
break;
|
||||||
|
case LLAMA_GRETYPE_CHAR:
|
||||||
|
case LLAMA_GRETYPE_CHAR_NOT:
|
||||||
|
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
||||||
|
case LLAMA_GRETYPE_CHAR_ALT:
|
||||||
|
case LLAMA_GRETYPE_CHAR_ANY:
|
||||||
|
fprintf(file, "(\"");
|
||||||
|
print_grammar_char(file, elem.value);
|
||||||
|
fprintf(file, "\") ");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fprintf(file, "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_rule(
|
||||||
|
FILE * file,
|
||||||
|
uint32_t rule_id,
|
||||||
|
const std::vector<llama_grammar_element> & rule,
|
||||||
|
const std::map<uint32_t, std::string> & symbol_id_names) {
|
||||||
|
if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
|
||||||
|
}
|
||||||
|
fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
|
||||||
|
for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
|
||||||
|
llama_grammar_element elem = rule[i];
|
||||||
|
switch (elem.type) {
|
||||||
|
case LLAMA_GRETYPE_END:
|
||||||
|
throw std::runtime_error(
|
||||||
|
"unexpected end of rule: " + std::to_string(rule_id) + "," +
|
||||||
|
std::to_string(i));
|
||||||
|
case LLAMA_GRETYPE_ALT:
|
||||||
|
fprintf(file, "| ");
|
||||||
|
break;
|
||||||
|
case LLAMA_GRETYPE_RULE_REF:
|
||||||
|
fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
|
||||||
|
break;
|
||||||
|
case LLAMA_GRETYPE_CHAR:
|
||||||
|
fprintf(file, "[");
|
||||||
|
print_grammar_char(file, elem.value);
|
||||||
|
break;
|
||||||
|
case LLAMA_GRETYPE_CHAR_NOT:
|
||||||
|
fprintf(file, "[^");
|
||||||
|
print_grammar_char(file, elem.value);
|
||||||
|
break;
|
||||||
|
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
||||||
|
if (i == 0 || !is_char_element(rule[i - 1])) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
|
||||||
|
std::to_string(rule_id) + "," + std::to_string(i));
|
||||||
|
}
|
||||||
|
fprintf(file, "-");
|
||||||
|
print_grammar_char(file, elem.value);
|
||||||
|
break;
|
||||||
|
case LLAMA_GRETYPE_CHAR_ALT:
|
||||||
|
if (i == 0 || !is_char_element(rule[i - 1])) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
|
||||||
|
std::to_string(rule_id) + "," + std::to_string(i));
|
||||||
|
}
|
||||||
|
print_grammar_char(file, elem.value);
|
||||||
|
break;
|
||||||
|
case LLAMA_GRETYPE_CHAR_ANY:
|
||||||
|
fprintf(file, ".");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (is_char_element(elem)) {
|
||||||
|
switch (rule[i + 1].type) {
|
||||||
|
case LLAMA_GRETYPE_CHAR_ALT:
|
||||||
|
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
||||||
|
case LLAMA_GRETYPE_CHAR_ANY:
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
fprintf(file, "] ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fprintf(file, "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
void print_grammar(FILE * file, const parse_state & state) {
|
||||||
|
try {
|
||||||
|
std::map<uint32_t, std::string> symbol_id_names;
|
||||||
|
for (const auto & kv : state.symbol_ids) {
|
||||||
|
symbol_id_names[kv.second] = kv.first;
|
||||||
|
}
|
||||||
|
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
|
||||||
|
// fprintf(file, "%zu: ", i);
|
||||||
|
// print_rule_binary(file, state.rules[i]);
|
||||||
|
print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
|
||||||
|
// fprintf(file, "\n");
|
||||||
|
}
|
||||||
|
} catch (const std::exception & err) {
|
||||||
|
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<const llama_grammar_element *> parse_state::c_rules() {
|
||||||
|
std::vector<const llama_grammar_element *> ret;
|
||||||
|
ret.reserve(rules.size());
|
||||||
|
for (const auto & rule : rules) {
|
||||||
|
ret.push_back(rule.data());
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
}
|
common/grammar-parser.h (new file, 29 lines)
@@ -0,0 +1,29 @@
+// Implements a parser for an extended Backus-Naur form (BNF), producing the
+// binary context-free grammar format specified by llama.h. Supports character
+// ranges, grouping, and repetition operators. As an example, a grammar for
+// arithmetic might look like:
+//
+// root  ::= expr
+// expr  ::= term ([-+*/] term)*
+// term  ::= num | "(" space expr ")" space
+// num   ::= [0-9]+ space
+// space ::= [ \t\n]*
+
+#pragma once
+#include "llama.h"
+#include <vector>
+#include <map>
+#include <cstdint>
+#include <string>
+
+namespace grammar_parser {
+    struct parse_state {
+        std::map<std::string, uint32_t>                 symbol_ids;
+        std::vector<std::vector<llama_grammar_element>> rules;
+
+        std::vector<const llama_grammar_element *> c_rules();
+    };
+
+    parse_state parse(const char * src);
+    void print_grammar(FILE * file, const parse_state & state);
+}
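Minimal usage sketch for the parser declared above (a hypothetical driver, not part of the diff): parse a small GBNF grammar and print it back.

    #include "grammar-parser.h"
    #include <cstdio>

    int main() {
        const char * gbnf = "root ::= \"yes\" | \"no\"\n";
        grammar_parser::parse_state state = grammar_parser::parse(gbnf);
        if (state.rules.empty()) {
            fprintf(stderr, "failed to parse grammar\n"); // parse() logs the error and returns an empty state
            return 1;
        }
        grammar_parser::print_grammar(stdout, state);
        // state.c_rules() yields the element pointers consumed by the grammar API in llama.h
        return 0;
    }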
@@ -1,6 +1,4 @@
 #include "json-schema-to-grammar.h"
-#include "common.h"
-
 #include <algorithm>
 #include <fstream>
 #include <map>

@@ -13,6 +11,11 @@
 
 using json = nlohmann::ordered_json;
 
+template <typename Iterator>
+static std::string join(Iterator begin, Iterator end, const std::string & separator);
+
+static std::string repeat(const std::string & str, size_t n);
+
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
     auto has_max = max_items != std::numeric_limits<int>::max();

@@ -125,8 +128,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         if (sub_len > 0) {
             auto from_sub = from.substr(i + 1);
             auto to_sub = to.substr(i + 1);
-            auto sub_zeros = string_repeat("0", sub_len);
-            auto sub_nines = string_repeat("9", sub_len);
+            auto sub_zeros = repeat("0", sub_len);
+            auto sub_nines = repeat("9", sub_len);
 
             auto to_reached = false;
             out << "(";

@@ -185,8 +188,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         auto max_digits = max_s.length();
 
         for (auto digits = min_digits; digits < max_digits; digits++) {
-            uniform_range(min_s, string_repeat("9", digits));
-            min_s = "1" + string_repeat("0", digits);
+            uniform_range(min_s, repeat("9", digits));
+            min_s = "1" + repeat("0", digits);
             out << " | ";
         }
         uniform_range(min_s, max_s);

@@ -315,6 +318,49 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
 std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
 
+template <typename Iterator>
+std::string join(Iterator begin, Iterator end, const std::string & separator) {
+    std::ostringstream result;
+    if (begin != end) {
+        result << *begin;
+        for (Iterator it = begin + 1; it != end; ++it) {
+            result << separator << *it;
+        }
+    }
+    return result.str();
+}
+
+static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
+    std::vector<std::string> tokens;
+    size_t start = 0;
+    size_t end = str.find(delimiter);
+
+    while (end != std::string::npos) {
+        tokens.push_back(str.substr(start, end - start));
+        start = end + delimiter.length();
+        end = str.find(delimiter, start);
+    }
+
+    tokens.push_back(str.substr(start));
+
+    return tokens;
+}
+
+static std::string repeat(const std::string & str, size_t n) {
+    if (n == 0) {
+        return "";
+    }
+
+    std::string result;
+    result.reserve(str.length() * n);
+
+    for (size_t i = 0; i < n; ++i) {
+        result += str;
+    }
+
+    return result;
+}
+
 static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
     std::smatch match;
     std::string result;
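The helpers added in the hunk above behave like the string_* utilities they replace; a tiny illustration (it would have to live in the same translation unit, since the functions are internal, and it assumes <cassert> is included):

    static void helpers_example() {
        std::vector<std::string> parts = split("a,b,c", ",");      // {"a", "b", "c"}
        assert(join(parts.begin(), parts.end(), "-") == "a-b-c");
        assert(repeat("ab", 3) == "ababab");
    }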
@@ -343,7 +389,6 @@ static std::string format_literal(const std::string & literal) {
 
 class SchemaConverter {
 private:
-    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
     std::function<json(const std::string &)> _fetch_json;
     bool _dotall;
     std::map<std::string, std::string> _rules;

@@ -373,7 +418,7 @@ private:
         for (size_t i = 0; i < alt_schemas.size(); i++) {
             rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
         }
-        return string_join(rules, " | ");
+        return join(rules.begin(), rules.end(), " | ");
     }
 
     std::string _visit_pattern(const std::string & pattern, const std::string & name) {

@@ -436,7 +481,7 @@ private:
             for (const auto & item : ret) {
                 results.push_back(to_rule(item));
             }
-            return std::make_pair(string_join(results, " "), false);
+            return std::make_pair(join(results.begin(), results.end(), " "), false);
         };
 
         while (i < length) {

@@ -494,7 +539,7 @@ private:
                 }
                 curly_brackets += '}';
                 i++;
-                auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
+                auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
                 int min_times = 0;
                 int max_times = std::numeric_limits<int>::max();
                 try {

@@ -566,7 +611,7 @@ private:
             }
             return join_seq();
         };
-        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
+        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
     }
 
     /*

@@ -764,11 +809,10 @@ private:
 public:
     SchemaConverter(
         const std::function<json(const std::string &)> & fetch_json,
-        bool dotall,
-        bool compact_spaces)
+        bool dotall)
         : _fetch_json(fetch_json), _dotall(dotall)
     {
-        _rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
+        _rules["space"] = SPACE_RULE;
     }
 
     void resolve_refs(json & schema, const std::string & url) {

@@ -810,7 +854,7 @@ public:
             return;
         }
         std::string pointer = ref.substr(ref.find('#') + 1);
-        std::vector<std::string> tokens = string_split(pointer, "/");
+        std::vector<std::string> tokens = split(pointer, "/");
         for (size_t i = 1; i < tokens.size(); ++i) {
             std::string sel = tokens[i];
             if (target.is_null() || !target.contains(sel)) {

@@ -861,7 +905,7 @@ public:
             for (const auto & v : schema["enum"]) {
                 enum_values.push_back(_generate_constant_rule(v));
             }
-            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
+            return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
         } else if ((schema_type.is_null() || schema_type == "object")
                 && (schema.contains("properties") ||
                     (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {

@@ -975,10 +1019,10 @@ public:
 
     void check_errors() {
        if (!_errors.empty()) {
-            throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
+            throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
        }
        if (!_warnings.empty()) {
-            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
+            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
        }
    }

@@ -991,35 +1035,11 @@ public:
     }
 };
 
-std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
-#ifdef LLAMA_USE_LLGUIDANCE
-    if (!force_gbnf) {
-        return "%llguidance {}\nstart: %json " + schema.dump();
-    }
-#else
-    (void)force_gbnf;
-#endif // LLAMA_USE_LLGUIDANCE
-    return build_grammar([&](const common_grammar_builder & callbacks) {
-        auto copy = schema;
-        callbacks.resolve_refs(copy);
-        callbacks.add_schema("", copy);
-    });
-}
-
-std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
-    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
-    common_grammar_builder builder {
-        /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
-            return converter._add_rule(name, rule);
-        },
-        /* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
-            return converter.visit(schema, name == "root" ? "" : name);
-        },
-        /* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
-            converter.resolve_refs(schema, "");
-        }
-    };
-    cb(builder);
+std::string json_schema_to_grammar(const json & schema) {
+    SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
+    auto copy = schema;
+    converter.resolve_refs(copy, "input");
+    converter.visit(copy, "");
     converter.check_errors();
     return converter.format_grammar();
 }
@@ -5,18 +5,4 @@
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 
-std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
-    bool force_gbnf = false);
-
-struct common_grammar_builder {
-    std::function<std::string(const std::string &, const std::string &)> add_rule;
-    std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
-    std::function<void(nlohmann::ordered_json &)> resolve_refs;
-};
-
-struct common_grammar_options {
-    bool dotall = false;
-    bool compact_spaces = false;
-};
-
-std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
+std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
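A sketch of calling the converter through the header above (not part of the diff): turn a small JSON schema into a GBNF grammar string. The one-argument call works on both sides of this hunk, since force_gbnf is defaulted on the removed signature.

    #include "json-schema-to-grammar.h"
    #include <cstdio>

    int main() {
        auto schema = nlohmann::ordered_json::parse(R"({
            "type": "object",
            "properties": { "answer": { "type": "string" } },
            "required": ["answer"]
        })");
        std::string gbnf = json_schema_to_grammar(schema);
        printf("%s\n", gbnf.c_str());
        return 0;
    }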
|
@ -1,270 +0,0 @@
|
||||||
#include "sampling.h"
|
|
||||||
#include "log.h"
|
|
||||||
|
|
||||||
#ifdef LLAMA_USE_LLGUIDANCE
|
|
||||||
|
|
||||||
# include "llguidance.h"
|
|
||||||
# include <cmath>
|
|
||||||
|
|
||||||
struct llama_sampler_llg {
|
|
||||||
const llama_vocab * vocab;
|
|
||||||
std::string grammar_kind;
|
|
||||||
std::string grammar_data;
|
|
||||||
LlgTokenizer * tokenizer;
|
|
||||||
LlgConstraint * grammar;
|
|
||||||
LlgMaskResult llg_res;
|
|
||||||
bool has_llg_res;
|
|
||||||
};
|
|
||||||
|
|
||||||
static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
|
|
||||||
const char * grammar_data) {
|
|
||||||
LlgConstraintInit cinit;
|
|
||||||
llg_constraint_init_set_defaults(&cinit, tokenizer);
|
|
||||||
const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
|
|
||||||
if (log_level && *log_level) {
|
|
||||||
cinit.log_stderr_level = atoi(log_level);
|
|
||||||
}
|
|
||||||
auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
|
|
||||||
if (llg_get_error(c)) {
|
|
||||||
LOG_ERR("llg error: %s\n", llg_get_error(c));
|
|
||||||
llg_free_constraint(c);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
|
|
||||||
return "llguidance";
|
|
||||||
}
|
|
||||||
|
|
||||||
static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
|
|
||||||
auto * ctx = (llama_sampler_llg *) smpl->ctx;
|
|
||||||
if (ctx->grammar) {
|
|
||||||
LlgCommitResult res;
|
|
||||||
llg_commit_token(ctx->grammar, token, &res);
|
|
||||||
ctx->has_llg_res = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
|
|
||||||
auto * ctx = (llama_sampler_llg *) smpl->ctx;
|
|
||||||
if (ctx->grammar) {
|
|
||||||
if (!ctx->has_llg_res) {
|
|
||||||
if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
|
|
||||||
ctx->has_llg_res = true;
|
|
||||||
} else {
|
|
||||||
LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
|
|
||||||
llg_free_constraint(ctx->grammar);
|
|
||||||
ctx->grammar = nullptr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (ctx->has_llg_res) {
|
|
||||||
if (ctx->llg_res.is_stop) {
|
|
||||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
|
||||||
if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
|
|
||||||
cur_p->data[i].logit = -INFINITY;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
const uint32_t * mask = ctx->llg_res.sample_mask;
|
|
||||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
|
||||||
auto token = cur_p->data[i].id;
|
|
||||||
if ((mask[token / 32] & (1 << (token % 32))) == 0) {
|
|
||||||
cur_p->data[i].logit = -INFINITY;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void llama_sampler_llg_reset(llama_sampler * smpl) {
|
|
||||||
auto * ctx = (llama_sampler_llg *) smpl->ctx;
|
|
||||||
if (!ctx->grammar) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
|
|
||||||
llg_free_constraint(ctx->grammar);
|
|
||||||
ctx->grammar = grammar_new;
|
|
||||||
ctx->has_llg_res = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
|
|
||||||
const auto * ctx = (const llama_sampler_llg *) smpl->ctx;
|
|
||||||
|
|
||||||
auto * result = llama_sampler_init_llg(ctx->vocab, nullptr, nullptr);
|
|
||||||
|
|
||||||
// copy the state
|
|
||||||
{
|
|
||||||
auto * result_ctx = (llama_sampler_llg *) result->ctx;
|
|
||||||
|
|
||||||
if (ctx->grammar) {
|
|
||||||
result_ctx->grammar_kind = ctx->grammar_kind;
|
|
||||||
result_ctx->grammar_data = ctx->grammar_data;
|
|
||||||
result_ctx->grammar = llg_clone_constraint(ctx->grammar);
|
|
||||||
result_ctx->tokenizer = llg_clone_tokenizer(ctx->tokenizer);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void llama_sampler_llg_free(llama_sampler * smpl) {
|
|
||||||
const auto * ctx = (llama_sampler_llg *) smpl->ctx;
|
|
||||||
|
|
||||||
if (ctx->grammar) {
|
|
||||||
llg_free_constraint(ctx->grammar);
|
|
||||||
llg_free_tokenizer(ctx->tokenizer);
|
|
||||||
}
|
|
||||||
|
|
||||||
delete ctx;
|
|
||||||
}
|
|
||||||
|
|
||||||
static llama_sampler_i llama_sampler_llg_i = {
|
|
||||||
/* .name = */ llama_sampler_llg_name,
|
|
||||||
/* .accept = */ llama_sampler_llg_accept_impl,
|
|
||||||
/* .apply = */ llama_sampler_llg_apply,
|
|
||||||
/* .reset = */ llama_sampler_llg_reset,
|
|
||||||
/* .clone = */ llama_sampler_llg_clone,
|
|
||||||
/* .free = */ llama_sampler_llg_free,
|
|
||||||
};
|
|
||||||
|
|
||||||
static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
|
|
||||||
uint32_t * output_tokens, size_t output_tokens_len) {
|
|
||||||
const llama_vocab * vocab = (const llama_vocab *) user_data;
|
|
||||||
int r = 0;
|
|
||||||
try {
|
|
||||||
r = llama_tokenize(vocab, (const char *) bytes, bytes_len, (int32_t *) output_tokens, output_tokens_len, false,
|
|
||||||
true);
|
|
||||||
} catch (const std::exception & e) {
|
|
||||||
GGML_ABORT("llama_tokenize failed: %s\n", e.what());
|
|
||||||
}
|
|
||||||
if (r < 0) {
|
|
||||||
return -r;
|
|
||||||
}
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
|
|
||||||
static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab) {
|
|
||||||
// TODO store the tokenizer in the vocab somehow
|
|
||||||
static const llama_vocab * vocab_cache;
|
|
||||||
static LlgTokenizer * tokenizer_cache;
|
|
||||||
|
|
||||||
if (vocab_cache == vocab) {
|
|
||||||
return llg_clone_tokenizer(tokenizer_cache);
|
|
||||||
}
|
|
||||||
|
|
||||||
auto tok_eos = llama_vocab_eot(vocab);
|
|
||||||
if (tok_eos == LLAMA_TOKEN_NULL) {
|
|
||||||
tok_eos = llama_vocab_eos(vocab);
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t vocab_size = llama_vocab_n_tokens(vocab);
|
|
||||||
|
|
||||||
auto token_lens = new uint32_t[vocab_size];
|
|
||||||
// we typically have ~7 bytes per token; let's go on the safe side here
|
|
||||||
auto token_bytes_size = vocab_size * 16 + 1024 * 1024;
|
|
||||||
auto token_bytes = new uint8_t[token_bytes_size];
|
|
||||||
|
|
||||||
size_t offset = 0;
|
|
||||||
for (size_t i = 0; i < vocab_size; i++) {
|
|
||||||
size_t max_token = 1024;
|
|
||||||
if (token_bytes_size - offset < max_token) {
|
|
||||||
GGML_ABORT("token_bytes buffer too small\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token token = i;
|
|
||||||
auto dp = (char *) token_bytes + offset;
|
|
||||||
auto size = llama_detokenize(vocab, &token, 1, dp, max_token, false, false);
|
|
||||||
if (size < 0) {
|
|
||||||
GGML_ABORT("llama_detokenize failed\n");
|
|
||||||
}
|
|
||||||
if (size == 0) {
|
|
||||||
size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true);
|
|
||||||
if (size < 0) {
|
|
||||||
GGML_ABORT("llama_detokenize failed\n");
|
|
||||||
}
|
|
||||||
if (size != 0) {
|
|
||||||
*dp = '\xff'; // special token prefix marker
|
|
||||||
size += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
token_lens[i] = size;
|
|
||||||
offset += size;
|
|
||||||
}
|
|
||||||
|
|
||||||
LlgTokenizerInit tinit = {
|
|
||||||
/* .vocab_size = */ (uint32_t) vocab_size,
|
|
||||||
/* .tok_eos = */ (uint32_t) tok_eos,
|
|
||||||
/* .token_lens = */ token_lens,
|
|
||||||
/* .token_bytes = */ token_bytes,
|
|
||||||
/* .tokenizer_json = */ nullptr,
|
|
||||||
/* .tokenize_assumes_string = */ true,
|
|
||||||
/* .tokenize_fn = */ llama_sampler_llg_tokenize_fn,
|
|
||||||
/* .use_approximate_greedy_tokenize_fn = */ false,
|
|
||||||
/* .tokenize_user_data = */ vocab,
|
|
||||||
};
|
|
||||||
|
|
||||||
char error_buffer[1024];
|
|
||||||
LlgTokenizer * tokenizer = llg_new_tokenizer(&tinit, error_buffer, sizeof(error_buffer));
|
|
||||||
|
|
||||||
delete[] token_bytes;
|
|
||||||
delete[] token_lens;
|
|
||||||
|
|
||||||
if (tokenizer == nullptr) {
|
|
||||||
LOG_ERR("llg tokenizer error: %s\n", error_buffer);
|
|
||||||
return tokenizer;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (tokenizer_cache) {
|
|
||||||
llg_free_tokenizer(tokenizer_cache);
|
|
||||||
}
|
|
||||||
vocab_cache = vocab;
|
|
||||||
tokenizer_cache = tokenizer;
|
|
||||||
|
|
||||||
return llg_clone_tokenizer(tokenizer_cache);
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * grammar_kind,
|
|
||||||
const char * grammar_data) {
|
|
||||||
auto * ctx = new llama_sampler_llg;
|
|
||||||
|
|
||||||
if (grammar_kind != nullptr && grammar_kind[0] != '\0') {
|
|
||||||
auto tokenizer = llama_sampler_llg_new_tokenizer(vocab);
|
|
||||||
*ctx = {
|
|
||||||
/* .vocab = */ vocab,
|
|
||||||
/* .grammar_kind = */ grammar_kind,
|
|
||||||
/* .grammar_data = */ grammar_data,
|
|
||||||
/* .tokenizer = */ tokenizer,
|
|
||||||
/* .grammar = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
|
|
||||||
/* .llg_res = */ {},
|
|
||||||
/* .has_llg_res = */ false,
|
|
||||||
};
|
|
||||||
} else {
|
|
||||||
*ctx = {
|
|
||||||
/* .vocab = */ vocab,
|
|
||||||
/* .grammar_kind = */ {},
|
|
||||||
/* .grammar_data = */ {},
|
|
||||||
/* .tokenizer = */ nullptr,
|
|
||||||
/* .grammar = */ nullptr,
|
|
||||||
/* .llg_res = */ {},
|
|
||||||
/* .has_llg_res = */ false,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
return llama_sampler_init(
|
|
||||||
/* .iface = */ &llama_sampler_llg_i,
|
|
||||||
/* .ctx = */ ctx
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
llama_sampler * llama_sampler_init_llg(const llama_vocab *, const char *, const char *) {
|
|
||||||
LOG_WRN("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // LLAMA_USE_LLGUIDANCE
|
|
common/log.cpp (deleted file, 392 lines)
@@ -1,392 +0,0 @@
|
||||||
#include "log.h"
|
|
||||||
|
|
||||||
#include <condition_variable>
|
|
||||||
#include <cstdarg>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <mutex>
|
|
||||||
#include <sstream>
|
|
||||||
#include <thread>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
|
|
||||||
|
|
||||||
void common_log_set_verbosity_thold(int verbosity) {
|
|
||||||
common_log_verbosity_thold = verbosity;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int64_t t_us() {
|
|
||||||
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
|
|
||||||
}
|
|
||||||
|
|
||||||
// colors
|
|
||||||
enum common_log_col : int {
|
|
||||||
COMMON_LOG_COL_DEFAULT = 0,
|
|
||||||
COMMON_LOG_COL_BOLD,
|
|
||||||
COMMON_LOG_COL_RED,
|
|
||||||
COMMON_LOG_COL_GREEN,
|
|
||||||
COMMON_LOG_COL_YELLOW,
|
|
||||||
COMMON_LOG_COL_BLUE,
|
|
||||||
COMMON_LOG_COL_MAGENTA,
|
|
||||||
COMMON_LOG_COL_CYAN,
|
|
||||||
COMMON_LOG_COL_WHITE,
|
|
||||||
};
|
|
||||||
|
|
||||||
// disable colors by default
|
|
||||||
static std::vector<const char *> g_col = {
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
};
|
|
||||||
|
|
||||||
struct common_log_entry {
|
|
||||||
enum ggml_log_level level;
|
|
||||||
|
|
||||||
bool prefix;
|
|
||||||
|
|
||||||
int64_t timestamp;
|
|
||||||
|
|
||||||
std::vector<char> msg;
|
|
||||||
|
|
||||||
// signals the worker thread to stop
|
|
||||||
bool is_end;
|
|
||||||
|
|
||||||
void print(FILE * file = nullptr) const {
|
|
||||||
FILE * fcur = file;
|
|
||||||
if (!fcur) {
|
|
||||||
// stderr displays DBG messages only when their verbosity level is not higher than the threshold
|
|
||||||
// these messages will still be logged to a file
|
|
||||||
if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
fcur = stdout;
|
|
||||||
|
|
||||||
if (level != GGML_LOG_LEVEL_NONE) {
|
|
||||||
fcur = stderr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
|
|
            if (timestamp) {
                // [M.s.ms.us]
                fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
                        g_col[COMMON_LOG_COL_BLUE],
                        (int) (timestamp / 1000000 / 60),
                        (int) (timestamp / 1000000 % 60),
                        (int) (timestamp / 1000 % 1000),
                        (int) (timestamp % 1000),
                        g_col[COMMON_LOG_COL_DEFAULT]);
            }

            switch (level) {
                case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN],   g_col[COMMON_LOG_COL_DEFAULT]); break;
                case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], ""                            ); break;
                case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED],     ""                            ); break;
                case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW],  ""                            ); break;
                default:
                    break;
            }
        }

        fprintf(fcur, "%s", msg.data());

        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
            fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
        }

        fflush(fcur);
    }
};

struct common_log {
    // default capacity - will be expanded if needed
    common_log() : common_log(256) {}

    common_log(size_t capacity) {
        file = nullptr;
        prefix = false;
        timestamps = false;
        running = false;
        t_start = t_us();

        // initial message size - will be expanded if longer messages arrive
        entries.resize(capacity);
        for (auto & entry : entries) {
            entry.msg.resize(256);
        }

        head = 0;
        tail = 0;

        resume();
    }

    ~common_log() {
        pause();
        if (file) {
            fclose(file);
        }
    }

private:
    std::mutex mtx;
    std::thread thrd;
    std::condition_variable cv;

    FILE * file;

    bool prefix;
    bool timestamps;
    bool running;

    int64_t t_start;

    // ring buffer of entries
    std::vector<common_log_entry> entries;
    size_t head;
    size_t tail;

    // worker thread copies into this
    common_log_entry cur;

public:
    void add(enum ggml_log_level level, const char * fmt, va_list args) {
        std::lock_guard<std::mutex> lock(mtx);

        if (!running) {
            // discard messages while the worker thread is paused
            return;
        }

        auto & entry = entries[tail];

        {
            // cannot use args twice, so make a copy in case we need to expand the buffer
            va_list args_copy;
            va_copy(args_copy, args);

#if 1
            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
            if (n >= entry.msg.size()) {
                entry.msg.resize(n + 1);
                vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
            }
#else
            // hack for bolding arguments

            std::stringstream ss;
            for (int i = 0; fmt[i] != 0; i++) {
                if (fmt[i] == '%') {
                    ss << LOG_COL_BOLD;
                    while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
                    ss << LOG_COL_DEFAULT;
                    if (fmt[i] == 0) break;
                }
                ss << fmt[i];
            }
            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
            if (n >= entry.msg.size()) {
                entry.msg.resize(n + 1);
                vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
            }
#endif
            va_end(args_copy);
        }

        entry.level = level;
        entry.prefix = prefix;
        entry.timestamp = 0;
        if (timestamps) {
            entry.timestamp = t_us() - t_start;
        }
        entry.is_end = false;

        tail = (tail + 1) % entries.size();
        if (tail == head) {
            // expand the buffer
            std::vector<common_log_entry> new_entries(2*entries.size());

            size_t new_tail = 0;

            do {
                new_entries[new_tail] = std::move(entries[head]);

                head     = (head + 1) % entries.size();
                new_tail = (new_tail + 1);
            } while (head != tail);

            head = 0;
            tail = new_tail;

            for (size_t i = tail; i < new_entries.size(); i++) {
                new_entries[i].msg.resize(256);
            }

            entries = std::move(new_entries);
        }

        cv.notify_one();
    }

    void resume() {
        std::lock_guard<std::mutex> lock(mtx);

        if (running) {
            return;
        }

        running = true;

        thrd = std::thread([this]() {
            while (true) {
                {
                    std::unique_lock<std::mutex> lock(mtx);
                    cv.wait(lock, [this]() { return head != tail; });

                    cur = entries[head];

                    head = (head + 1) % entries.size();
                }

                if (cur.is_end) {
                    break;
                }

                cur.print(); // stdout and stderr

                if (file) {
                    cur.print(file);
                }
            }
        });
    }

    void pause() {
        {
            std::lock_guard<std::mutex> lock(mtx);

            if (!running) {
                return;
            }

            running = false;

            // push an entry to signal the worker thread to stop
            {
                auto & entry = entries[tail];
                entry.is_end = true;

                tail = (tail + 1) % entries.size();
            }

            cv.notify_one();
        }

        thrd.join();
    }

    void set_file(const char * path) {
        pause();

        if (file) {
            fclose(file);
        }

        if (path) {
            file = fopen(path, "w");
        } else {
            file = nullptr;
        }

        resume();
    }

    void set_colors(bool colors) {
        pause();

        if (colors) {
            g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
            g_col[COMMON_LOG_COL_BOLD]    = LOG_COL_BOLD;
            g_col[COMMON_LOG_COL_RED]     = LOG_COL_RED;
            g_col[COMMON_LOG_COL_GREEN]   = LOG_COL_GREEN;
            g_col[COMMON_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
            g_col[COMMON_LOG_COL_BLUE]    = LOG_COL_BLUE;
            g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
            g_col[COMMON_LOG_COL_CYAN]    = LOG_COL_CYAN;
            g_col[COMMON_LOG_COL_WHITE]   = LOG_COL_WHITE;
        } else {
            for (size_t i = 0; i < g_col.size(); i++) {
                g_col[i] = "";
            }
        }

        resume();
    }

    void set_prefix(bool prefix) {
        std::lock_guard<std::mutex> lock(mtx);

        this->prefix = prefix;
    }

    void set_timestamps(bool timestamps) {
        std::lock_guard<std::mutex> lock(mtx);

        this->timestamps = timestamps;
    }
};

//
// public API
//

struct common_log * common_log_init() {
    return new common_log;
}

struct common_log * common_log_main() {
    static struct common_log log;

    return &log;
}

void common_log_pause(struct common_log * log) {
    log->pause();
}

void common_log_resume(struct common_log * log) {
    log->resume();
}

void common_log_free(struct common_log * log) {
    delete log;
}

void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    log->add(level, fmt, args);
    va_end(args);
}

void common_log_set_file(struct common_log * log, const char * file) {
    log->set_file(file);
}

void common_log_set_colors(struct common_log * log, bool colors) {
    log->set_colors(colors);
}

void common_log_set_prefix(struct common_log * log, bool prefix) {
    log->set_prefix(prefix);
}

void common_log_set_timestamps(struct common_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
}
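Below is a minimal usage sketch (not part of this diff) of the common_log public API implemented above. It assumes only the declarations shown here plus ggml.h for enum ggml_log_level, and re-declares the prototypes so the snippet stays self-contained.

// usage-sketch.cpp - illustrative only, assumes the common_log API shown above
#include "ggml.h"   // enum ggml_log_level

struct common_log;  // opaque handle, defined in common/log.cpp

struct common_log * common_log_init();
void                common_log_free(struct common_log * log);
void                common_log_add (struct common_log * log, enum ggml_log_level level, const char * fmt, ...);
void                common_log_set_colors    (struct common_log * log, bool colors);
void                common_log_set_prefix    (struct common_log * log, bool prefix);
void                common_log_set_timestamps(struct common_log * log, bool timestamps);

int main() {
    struct common_log * log = common_log_init();

    common_log_set_colors    (log, true);  // enable the ANSI colors from the g_col table
    common_log_set_prefix    (log, true);  // print the I/W/E/D level prefix
    common_log_set_timestamps(log, true);  // print the [M.s.ms.us] timestamp

    common_log_add(log, GGML_LOG_LEVEL_INFO, "loaded %d tensors\n", 291);
    common_log_add(log, GGML_LOG_LEVEL_WARN, "falling back to CPU\n");

    common_log_free(log);                  // joins the worker thread and closes any log file
    return 0;
}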
785  common/log.h
@@ -1,103 +1,724 @@
#pragma once

-#include "ggml.h" // for ggml_log_level
+#include <chrono>
+#include <cstring>
+#include <sstream>
+#include <iostream>
+#include <thread>
+#include <vector>
+#include <algorithm>
+#include <cinttypes>

-#define LOG_CLR_TO_EOL  "\033[K\r"
+// --------------------------------
-#define LOG_COL_DEFAULT "\033[0m"
+//
-#define LOG_COL_BOLD    "\033[1m"
+// Basic usage:
-#define LOG_COL_RED     "\033[31m"
+//
-#define LOG_COL_GREEN   "\033[32m"
+// --------
-#define LOG_COL_YELLOW  "\033[33m"
+//
-#define LOG_COL_BLUE    "\033[34m"
+// The LOG() and LOG_TEE() macros are ready to go by default
-#define LOG_COL_MAGENTA "\033[35m"
+// they do not require any initialization.
-#define LOG_COL_CYAN    "\033[36m"
+//
-#define LOG_COL_WHITE   "\033[37m"
+// LOGLN() and LOG_TEELN() are variants which automatically
+// include \n character at the end of the log string.
+//
+// LOG() behaves exactly like printf, by default writing to a logfile.
+// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
+//
+// Default logfile is named
+//  "llama.<threadID>.log"
+// Default LOG_TEE() secondary output target is
+//  stderr
+//
+// Logs can be dynamically disabled or enabled using functions:
+//  log_disable()
+// and
+//  log_enable()
+//
+// A log target can be changed with:
+//  log_set_target( string )
+//  creating and opening, or re-opening a file by string filename
+//  or
+//  log_set_target( FILE* )
+//  allowing to point at stderr, stdout, or any valid FILE* file handler.
+//
+// --------
+//
+// End of Basic usage.
+//
+// --------------------------------

-#ifndef __GNUC__
+// Specifies a log target.
-#    define LOG_ATTRIBUTE_FORMAT(...)
+// default uses log_handler() with "llama.log" log file
-#elif defined(__MINGW32__)
+// this can be changed, by defining LOG_TARGET
-#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+// like so:
-#else
+//
-#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+// #define LOG_TARGET (a valid FILE*)
+// #include "log.h"
+//
+// or it can be simply redirected to stdout or stderr
+// like so:
+//
+// #define LOG_TARGET stderr
+// #include "log.h"
+//
+// The log target can also be redirected to a different function
+// like so:
+//
+// #define LOG_TARGET log_handler_different()
+// #include "log.h"
+//
+// FILE* log_handler_different()
+// {
+//     return stderr;
+// }
+//
+// or:
+//
+// #define LOG_TARGET log_handler_another_one("somelog.log")
+// #include "log.h"
+//
+// FILE* log_handler_another_one(char*filename)
+// {
+//     static FILE* logfile = nullptr;
+//     (...)
+//     if( !logfile )
+//     {
+//         fopen(...)
+//     }
+//     (...)
+//     return logfile
+// }
+//
+#ifndef LOG_TARGET
+#define LOG_TARGET log_handler()
#endif

-#define LOG_DEFAULT_DEBUG 1
+#ifndef LOG_TEE_TARGET
-#define LOG_DEFAULT_LLAMA 0
+#define LOG_TEE_TARGET stderr
+#endif

-// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
+// Utility for synchronizing log configuration state
-// set via common_log_set_verbosity()
+// since std::optional was introduced only in c++17
-extern int common_log_verbosity_thold;
+enum LogTriState
+{
+    LogTriStateSame,
+    LogTriStateFalse,
+    LogTriStateTrue
+};

-void common_log_set_verbosity_thold(int verbosity); // not thread-safe
+// Utility to obtain "pid" like unique process id and use it when creating log files.
+inline std::string log_get_pid()
+{
+    static std::string pid;
+    if (pid.empty())
+    {
+        // std::this_thread::get_id() is the most portable way of obtaining a "process id"
+        // it's not the same as "pid" but is unique enough to solve multiple instances
+        // trying to write to the same log.
+        std::stringstream ss;
+        ss << std::this_thread::get_id();
+        pid = ss.str();
+    }

-// the common_log uses an internal worker thread to print/write log messages
+    return pid;
-// when the worker thread is paused, incoming log messages are discarded
+}
-struct common_log;

-struct common_log * common_log_init();
+// Utility function for generating log file names with unique id based on thread id.
-struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
-void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
+// where the number is a runtime id of the current thread.
-void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
-void common_log_free  (struct common_log * log);

-LOG_ATTRIBUTE_FORMAT(3, 4)
+#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
-void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...);

-// defaults: file = NULL, colors = false, prefix = false, timestamps = false
+// INTERNAL, DO NOT USE
-//
+inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
-// regular log output:
+{
-//
+    static bool _multilog = false;
-//   ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
-//   llm_load_tensors: ggml ctx size = 0.27 MiB
-//   llm_load_tensors: offloading 32 repeating layers to GPU
-//   llm_load_tensors: offloading non-repeating layers to GPU
-//
-// with prefix = true, timestamps = true, the log output will look like this:
-//
-//   0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
-//   0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB
-//   0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
-//   0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
-//
-// I - info    (stdout, V = 0)
-// W - warning (stderr, V = 0)
-// E - error   (stderr, V = 0)
-// D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
-//

-void common_log_set_file      (struct common_log * log, const char * file);       // not thread-safe
+    if (multilog != LogTriStateSame)
-void common_log_set_colors    (struct common_log * log,       bool   colors);     // not thread-safe
+    {
-void common_log_set_prefix    (struct common_log * log,       bool   prefix);     // whether to output prefix to each log
+        _multilog = multilog == LogTriStateTrue;
-void common_log_set_timestamps(struct common_log * log,       bool   timestamps); // whether to output timestamps in the prefix
+    }

-// helper macros for logging
+    std::stringstream buf;
-// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
-//
-// for example:
-//
-//   LOG_DBG("this is a debug message: %d\n", expensive_function());
-//
-// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
-//

-#define LOG_TMPL(level, verbosity, ...) \
+    buf << log_file_basename;
+    if (_multilog)
+    {
+        buf << ".";
+        buf << log_get_pid();
+    }
+    buf << ".";
+    buf << log_file_extension;

+    return buf.str();
+}

+#ifndef LOG_DEFAULT_FILE_NAME
+#define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
+#endif

+// Utility for turning #define values into string literals
+// so we can have a define for stderr and
+// we can print "stderr" instead of literal stderr, etc.
+#define LOG_STRINGIZE1(s) #s
+#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)

+#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)

+// Allows disabling timestamps.
+// in order to disable, define LOG_NO_TIMESTAMPS
+// like so:
+//
+// #define LOG_NO_TIMESTAMPS
+// #include "log.h"
+//
+#ifndef LOG_NO_TIMESTAMPS
+#ifndef _MSC_VER
+#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
+#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+#else
+#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
+#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+#endif
+#else
+#define LOG_TIMESTAMP_FMT "%s"
+#define LOG_TIMESTAMP_VAL ,""
+#endif

+#ifdef LOG_TEE_TIMESTAMPS
+#ifndef _MSC_VER
+#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
+#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+#else
+#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
+#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+#endif
+#else
+#define LOG_TEE_TIMESTAMP_FMT "%s"
+#define LOG_TEE_TIMESTAMP_VAL ,""
+#endif

+// Allows disabling file/line/function prefix
+// in order to disable, define LOG_NO_FILE_LINE_FUNCTION
+// like so:
+//
+// #define LOG_NO_FILE_LINE_FUNCTION
+// #include "log.h"
+//
+#ifndef LOG_NO_FILE_LINE_FUNCTION
+#ifndef _MSC_VER
+#define LOG_FLF_FMT "[%24s:%5d][%24s] "
+#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+#else
+#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
+#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
+#endif
+#else
+#define LOG_FLF_FMT "%s"
+#define LOG_FLF_VAL ,""
+#endif

+#ifdef LOG_TEE_FILE_LINE_FUNCTION
+#ifndef _MSC_VER
+#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
+#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+#else
+#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
+#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
+#endif
+#else
+#define LOG_TEE_FLF_FMT "%s"
+#define LOG_TEE_FLF_VAL ,""
+#endif

+// INTERNAL, DO NOT USE
+// USE LOG() INSTEAD
+//
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
+#define LOG_IMPL(str, ...) \
    do { \
-        if ((verbosity) <= common_log_verbosity_thold) { \
+        if (LOG_TARGET != nullptr) \
-            common_log_add(common_log_main(), (level), __VA_ARGS__); \
+        { \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
+            fflush(LOG_TARGET); \
        } \
    } while (0)
+#else
+#define LOG_IMPL(str, ...) \
+    do { \
+        if (LOG_TARGET != nullptr) \
+        { \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
+            fflush(LOG_TARGET); \
+        } \
+    } while (0)
+#endif

-#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, 0,         __VA_ARGS__)
+// INTERNAL, DO NOT USE
-#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
+// USE LOG_TEE() INSTEAD
+//
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
+#define LOG_TEE_IMPL(str, ...) \
+    do { \
+        if (LOG_TARGET != nullptr) \
+        { \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
+            fflush(LOG_TARGET); \
+        } \
+        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
+        { \
+            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
+            fflush(LOG_TEE_TARGET); \
+        } \
+    } while (0)
+#else
+#define LOG_TEE_IMPL(str, ...) \
+    do { \
+        if (LOG_TARGET != nullptr) \
+        { \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
+            fflush(LOG_TARGET); \
+        } \
+        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
+        { \
+            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
+            fflush(LOG_TEE_TARGET); \
+        } \
+    } while (0)
+#endif

-#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  0,                 __VA_ARGS__)
+// The '\0' as a last argument, is a trick to bypass the silly
-#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__)
+// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
-#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__)
+// so we can have a single macro which can be called just like printf.
-#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
-#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  0,                 __VA_ARGS__)

-#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
+// Main LOG macro.
-#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
+// behaves like printf, and supports arguments the exact same way.
-#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
+//
-#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
+#if !defined(_MSC_VER) || defined(__clang__)
-#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__)
+#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
+#else
+#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
+#endif

+// Main TEE macro.
+// does the same as LOG
+// and
+// simultaneously writes stderr.
+//
+// Secondary target can be changed just like LOG_TARGET
+// by defining LOG_TEE_TARGET
+//
+#if !defined(_MSC_VER) || defined(__clang__)
+#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
+#else
+#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
+#endif

+// LOG macro variants with auto endline.
+#if !defined(_MSC_VER) || defined(__clang__)
+#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
+#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
+#else
+#define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
+#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
+#endif

+// INTERNAL, DO NOT USE
+inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
+{
+    static bool _initialized = false;
+    static bool _append = false;
+    static bool _disabled = filename.empty() && target == nullptr;
+    static std::string log_current_filename{filename};
+    static FILE *log_current_target{target};
+    static FILE *logfile = nullptr;

+    if (change)
+    {
+        if (append != LogTriStateSame)
+        {
+            _append = append == LogTriStateTrue;
+            return logfile;
+        }

+        if (disable == LogTriStateTrue)
+        {
+            // Disable primary target
+            _disabled = true;
+        }
+        // If previously disabled, only enable, and keep previous target
+        else if (disable == LogTriStateFalse)
+        {
+            _disabled = false;
+        }
+        // Otherwise, process the arguments
+        else if (log_current_filename != filename || log_current_target != target)
+        {
+            _initialized = false;
+        }
+    }

+    if (_disabled)
+    {
+        // Log is disabled
+        return nullptr;
+    }

+    if (_initialized)
+    {
+        // with fallback in case something went wrong
+        return logfile ? logfile : stderr;
+    }

+    // do the (re)initialization
+    if (target != nullptr)
+    {
+        if (logfile != nullptr && logfile != stdout && logfile != stderr)
+        {
+            fclose(logfile);
+        }

+        log_current_filename = LOG_DEFAULT_FILE_NAME;
+        log_current_target = target;

+        logfile = target;
+    }
+    else
+    {
+        if (log_current_filename != filename)
+        {
+            if (logfile != nullptr && logfile != stdout && logfile != stderr)
+            {
+                fclose(logfile);
+            }
+        }

+        logfile = fopen(filename.c_str(), _append ? "a" : "w");
+    }

+    if (!logfile)
+    {
+        // Verify whether the file was opened, otherwise fallback to stderr
+        logfile = stderr;

+        fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
+        fflush(stderr);

+        // At this point we let the init flag be to true below, and let the target fallback to stderr
+        // otherwise we would repeatedly fopen() which was already unsuccessful
+    }

+    _initialized = true;

+    return logfile ? logfile : stderr;
+}

+// INTERNAL, DO NOT USE
+inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
+{
+    return log_handler1_impl(change, append, disable, filename, target);
+}

+// Disables logs entirely at runtime.
+// Makes LOG() and LOG_TEE() produce no output,
+// until enabled back.
+#define log_disable() log_disable_impl()

+// INTERNAL, DO NOT USE
+inline FILE *log_disable_impl()
+{
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
+}

+// Enables logs at runtime.
+#define log_enable() log_enable_impl()

+// INTERNAL, DO NOT USE
+inline FILE *log_enable_impl()
+{
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
+}

+// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
+#define log_set_target(target) log_set_target_impl(target)

+// INTERNAL, DO NOT USE
+inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
+inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }

+// INTERNAL, DO NOT USE
+inline FILE *log_handler() { return log_handler1_impl(); }

+// Enable or disable creating separate log files for each run.
+// can ONLY be invoked BEFORE first log use.
+#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
+// Enable or disable append mode for log file.
+// can ONLY be invoked BEFORE first log use.
+#define log_append(enable) log_append_impl(enable)
+// INTERNAL, DO NOT USE
+inline FILE *log_append_impl(bool enable)
+{
+    return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
+}

+inline void log_test()
+{
+    log_disable();
+    LOG("01 Hello World to nobody, because logs are disabled!\n");
+    log_enable();
+    LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
+    LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
+    log_set_target(stderr);
+    LOG("04 Hello World to stderr!\n");
+    LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
+    log_set_target(LOG_DEFAULT_FILE_NAME);
+    LOG("06 Hello World to default log file!\n");
+    log_set_target(stdout);
+    LOG("07 Hello World to stdout!\n");
+    log_set_target(LOG_DEFAULT_FILE_NAME);
+    LOG("08 Hello World to default log file again!\n");
+    log_disable();
+    LOG("09 Hello World _1_ into the void!\n");
+    log_enable();
+    LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
+    log_disable();
+    log_set_target("llama.anotherlog.log");
+    LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
+    log_enable();
+    LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
+    log_set_target("llama.yetanotherlog.log");
+    LOG("13 Hello World this time in yet new file?\n");
+    log_set_target(log_filename_generator("llama_autonamed", "log"));
+    LOG("14 Hello World in log with generated filename!\n");
+#ifdef _MSC_VER
+    LOG_TEE("15 Hello msvc TEE without arguments\n");
+    LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
+    LOG_TEELN("17 Hello msvc TEELN without arguments\n");
+    LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
+    LOG("19 Hello msvc LOG without arguments\n");
+    LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
+    LOGLN("21 Hello msvc LOGLN without arguments\n");
+    LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
+#endif
+}

+inline bool log_param_single_parse(const std::string & param)
+{
+    if ( param == "--log-test")
+    {
+        log_test();
+        return true;
+    }

+    if ( param == "--log-disable")
+    {
+        log_disable();
+        return true;
+    }

+    if ( param == "--log-enable")
+    {
+        log_enable();
+        return true;
+    }

+    if (param == "--log-new")
+    {
+        log_multilog(true);
+        return true;
+    }

+    if (param == "--log-append")
+    {
+        log_append(true);
+        return true;
+    }

+    return false;
+}

+inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
+{
+    if ( param == "--log-file")
+    {
+        if (!check_but_dont_parse)
+        {
+            log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
+        }

+        return true;
+    }

+    return false;
+}

+inline void log_print_usage()
+{
+    printf("log options:\n");
+    /* format
+    printf("  -h, --help            show this help message and exit\n");*/
+    /* spacing
+    printf("__-param----------------Description\n");*/
+    printf("  --log-test            Run simple logging test\n");
+    printf("  --log-disable         Disable trace logs\n");
+    printf("  --log-enable          Enable trace logs\n");
+    printf("  --log-file            Specify a log filename (without extension)\n");
+    printf("  --log-new             Create a separate new log file on start. "
+           "Each log file will have unique name: \"<name>.<ID>.log\"\n");
+    printf("  --log-append          Don't truncate the old log file.\n");
+    printf("\n");
+}

+#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)

+// INTERNAL, DO NOT USE
+inline void log_dump_cmdline_impl(int argc, char **argv)
+{
+    std::stringstream buf;
+    for (int i = 0; i < argc; ++i)
+    {
+        if (std::string(argv[i]).find(' ') != std::string::npos)
+        {
+            buf << " \"" << argv[i] <<"\"";
+        }
+        else
+        {
+            buf << " " << argv[i];
+        }
+    }
+    LOGLN("Cmd:%s", buf.str().c_str());
+}

+#define log_tostr(var) log_var_to_string_impl(var).c_str()

+inline std::string log_var_to_string_impl(bool var)
+{
+    return var ? "true" : "false";
+}

+inline std::string log_var_to_string_impl(std::string var)
+{
+    return var;
+}

+inline std::string log_var_to_string_impl(const std::vector<int> & var)
+{
+    std::stringstream buf;
+    buf << "[ ";
+    bool first = true;
+    for (auto e : var)
+    {
+        if (first)
+        {
+            first = false;
+        }
+        else
+        {
+            buf << ", ";
+        }
+        buf << std::to_string(e);
+    }
+    buf << " ]";

+    return buf.str();
+}

+template <typename C, typename T>
+inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
+{
+    std::stringstream buf;
+    buf << "[ ";

+    bool first = true;
+    for (const auto & token : tokens)
+    {
+        if (!first) {
+            buf << ", ";
+        } else {
+            first = false;
+        }

+        auto detokenized = llama_token_to_piece(ctx, token);

+        detokenized.erase(
+            std::remove_if(
+                detokenized.begin(),
+                detokenized.end(),
+                [](const unsigned char c) { return !std::isprint(c); }),
+            detokenized.end());

+        buf
+            << "'" << detokenized << "'"
+            << ":" << std::to_string(token);
+    }
+    buf << " ]";

+    return buf.str();
+}

+template <typename C, typename B>
+inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
+{
+    std::stringstream buf;
+    buf << "[ ";

+    bool first = true;
+    for (int i = 0; i < batch.n_tokens; ++i)
+    {
+        if (!first) {
+            buf << ", ";
+        } else {
+            first = false;
+        }

+        auto detokenized = llama_token_to_piece(ctx, batch.token[i]);

+        detokenized.erase(
+            std::remove_if(
+                detokenized.begin(),
+                detokenized.end(),
+                [](const unsigned char c) { return !std::isprint(c); }),
+            detokenized.end());

+        buf
+            << "\n" << std::to_string(i)
+            << ":token '" << detokenized << "'"
+            << ":pos " << std::to_string(batch.pos[i])
+            << ":n_seq_id " << std::to_string(batch.n_seq_id[i])
+            << ":seq_id " << std::to_string(batch.seq_id[i][0])
+            << ":logits " << std::to_string(batch.logits[i]);
+    }
+    buf << " ]";

+    return buf.str();
+}

+#ifdef LOG_DISABLE_LOGS

+#undef LOG
+#define LOG(...) // dummy stub
+#undef LOGLN
+#define LOGLN(...) // dummy stub

+#undef LOG_TEE
+#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf

+#undef LOG_TEELN
+#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf

+#undef LOG_DISABLE
+#define LOG_DISABLE() // dummy stub

+#undef LOG_ENABLE
+#define LOG_ENABLE() // dummy stub

+#undef LOG_ENABLE
+#define LOG_ENABLE() // dummy stub

+#undef LOG_SET_TARGET
+#define LOG_SET_TARGET(...) // dummy stub

+#undef LOG_DUMP_CMDLINE
+#define LOG_DUMP_CMDLINE(...) // dummy stub

+#endif // LOG_DISABLE_LOGS
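For comparison, here is a minimal sketch of driving the LOG_TEE-style header on the '+' side of this diff; it only uses macros and helpers declared in that header, and the behavior notes in the comments follow the header's own documentation rather than anything verified against the branch.

// old-api-sketch.cpp - illustrative only; assumes the "log.h" shown on the '+' side above
#include "log.h"

int main(int argc, char ** argv) {
    log_dump_cmdline(argc, argv);                 // records "Cmd: ..." in the default logfile via LOGLN()

    LOG("printf-style message to the logfile: %d\n", 42);
    LOG_TEE("also mirrored to " LOG_TEE_TARGET_STRING "\n");   // logfile + stderr, like `tee`

    log_set_target(log_filename_generator("example", "log"));  // e.g. "example.log" (or "example.<thread-id>.log" with log_multilog(true))
    LOG("now writing to the generated file name\n");

    log_set_target(stderr);                       // the primary target can also be any FILE*
    LOG("and now to stderr\n");

    log_disable();
    LOG("discarded while logging is disabled\n");
    log_enable();

    return 0;
}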
2883  common/minja.hpp
File diff suppressed because it is too large
@@ -2,13 +2,10 @@
#include "common.h"
#include "log.h"

-#include <cinttypes>
#include <cstdint>
-#include <cstdio>
#include <fstream>
-#include <thread>

-void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
+void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
        std::vector<llama_token> & inp, int nnew, bool print_progress) {
    const int64_t t_start_ms = ggml_time_ms();
    const int64_t inp_size = inp.size();
@@ -20,16 +17,16 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min,
    const int64_t i_start = std::max(inp_size - nnew, ngram_size);
    for (int64_t i = i_start; i < inp_size; ++i) {
        const int64_t ngram_start = i - ngram_size;
-       common_ngram ngram(&inp[ngram_start], ngram_size);
+       llama_ngram ngram(&inp[ngram_start], ngram_size);
        const llama_token token = inp[i];

-       common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
+       llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
        if (part_it == ngram_cache.end()) {
-           common_ngram_cache_part part;
+           llama_ngram_cache_part part;
            part.emplace(token, 1);
            ngram_cache.emplace(ngram, part);
        } else {
-           common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
+           llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
            if (token_count_it == part_it->second.end()) {
                part_it->second.emplace(token, 1);
            } else {
@@ -62,16 +59,16 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};

// Helper function that tries to draft a token from only the static ngram cache:
-static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
+static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
-   common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+   llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
    if (part_static_it == nc_static.end()) {
-       return LLAMA_TOKEN_NULL;
+       return -1;
    }
-   const common_ngram_cache_part part_static = part_static_it->second;
+   const llama_ngram_cache_part part_static = part_static_it->second;

    int max_count_static = 0;
    int sum_count_static = 0;
-   llama_token max_token = LLAMA_TOKEN_NULL;
+   llama_token max_token = -1;

    for (std::pair<llama_token, int> token_count_static : part_static) {
        const llama_token token = token_count_static.first;
@@ -85,39 +82,39 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
    }

    if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
-       return LLAMA_TOKEN_NULL;
+       return -1;
    }
    if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
-       return LLAMA_TOKEN_NULL;
+       return -1;
    }
    return max_token;
}

// Try to draft a token from primary cache (context/dynamic), validate with static cache:
static llama_token try_draft(
-   common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
+   llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
    const int * min_sample_size, const int * min_percent) {

-   llama_token drafted_token = LLAMA_TOKEN_NULL;
+   llama_token drafted_token = -1;

-   for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
+   for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
-       const common_ngram ngram_primary = ngrams_primary[i];
+       const llama_ngram ngram_primary = ngrams_primary[i];

-       common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
+       llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
        if (part_primary_it == nc_primary.end()) {
            continue;
        }
-       const common_ngram_cache_part part_primary = part_primary_it->second;
+       const llama_ngram_cache_part part_primary = part_primary_it->second;

        int max_count_primary = 0;
        int max_count_static  = 0;
        int sum_count_primary = 0;
-       llama_token max_token = LLAMA_TOKEN_NULL;
+       llama_token max_token = -1;

        for (std::pair<llama_token, int> token_count_primary : part_primary) {
            const llama_token token = token_count_primary.first;

-           common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
+           llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);

            const int32_t count_primary = token_count_primary.second;
            const int32_t count_static  = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
@@ -142,9 +139,9 @@ static llama_token try_draft(
    return drafted_token;
}

-void common_ngram_cache_draft(
+void llama_ngram_cache_draft(
    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-   common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
+   llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
) {
    GGML_ASSERT(draft.size() == 1);
    const int inp_size = inp.size();
@@ -154,40 +151,40 @@ void common_ngram_cache_draft(
    }

    while ((int) draft.size()-1 < n_draft) {
-       llama_token drafted_token = LLAMA_TOKEN_NULL;
+       llama_token drafted_token = -1;

        const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
-       common_ngram ngram_static;
+       llama_ngram ngram_static;
        for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
            ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
        }
-       common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+       llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
-       common_ngram_cache_part part_static;
+       llama_ngram_cache_part part_static;
        if (part_static_it != nc_static.end()) {
            part_static = part_static_it->second;
        }

        // cd = context + dynamic
-       std::vector<common_ngram> ngrams_cd;
+       std::vector<llama_ngram> ngrams_cd;
        for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
            const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
-           common_ngram ngram_cd;
+           llama_ngram ngram_cd;
            for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
                ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
            }
            ngrams_cd.push_back(ngram_cd);
        }
-       if (drafted_token == LLAMA_TOKEN_NULL) {
+       if (drafted_token == -1) {
            drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
        }
-       if (drafted_token == LLAMA_TOKEN_NULL) {
+       if (drafted_token == -1) {
            drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
        }
-       if (drafted_token == LLAMA_TOKEN_NULL) {
+       if (drafted_token == -1) {
            drafted_token = try_draft(nc_static, ngram_static);
        }

-       if (drafted_token == LLAMA_TOKEN_NULL) {
+       if (drafted_token == -1) {
            break;
        }

@@ -196,16 +193,16 @@ void common_ngram_cache_draft(
    }
}

-void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
+void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
    std::ofstream file_out(filename, std::ios::binary);
-   for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
+   for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
-       const common_ngram      ngram        = item.first;
+       const llama_ngram      ngram        = item.first;
-       common_ngram_cache_part token_counts = item.second;
+       llama_ngram_cache_part token_counts = item.second;
        GGML_ASSERT(!token_counts.empty());
        const int32_t ntokens = token_counts.size();
        GGML_ASSERT(ntokens > 0);

-       file_out.write(reinterpret_cast<const char *>(&ngram),   sizeof(common_ngram));
+       file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram));
        file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
        for (std::pair<llama_token, int32_t> item2 : token_counts) {
            const llama_token token = item2.first;
@@ -219,14 +216,14 @@ void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & fil

}

-common_ngram_cache common_ngram_cache_load(std::string & filename) {
+llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
    std::ifstream hashmap_file(filename, std::ios::binary);
    if (!hashmap_file) {
        throw std::ifstream::failure("Unable to open file " + filename);
    }
-   common_ngram_cache ngram_cache;
+   llama_ngram_cache ngram_cache;

-   common_ngram ngram;
+   llama_ngram ngram;
    int32_t     ntokens;
    llama_token token;
    int32_t     count;
@@ -235,11 +232,11 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) {
    char * ntokensc = reinterpret_cast<char*>(&ntokens);
    char * tokenc   = reinterpret_cast<char*>(&token);
    char * countc   = reinterpret_cast<char*>(&count);
-   while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
+   while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
        GGML_ASSERT(!hashmap_file.eof());
        GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
        GGML_ASSERT(ntokens > 0);
-       common_ngram_cache_part token_counts;
+       llama_ngram_cache_part token_counts;

        for (int i = 0; i < ntokens; ++i) {
            GGML_ASSERT(!hashmap_file.eof());
@@ -257,12 +254,12 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) {
    return ngram_cache;
}

-void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
+void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
-   for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
+   for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
-       const common_ngram      ngram = ngram_part.first;
+       const llama_ngram      ngram = ngram_part.first;
-       common_ngram_cache_part part  = ngram_part.second;
+       llama_ngram_cache_part part  = ngram_part.second;

-       common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
+       llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
        if (part_merged_it == ngram_cache_target.end()) {
            ngram_cache_target.emplace(ngram, part);
            continue;
@@ -273,7 +270,7 @@ void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ng
        const int32_t count = token_count.second;
        GGML_ASSERT(count > 0);

-       common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
+       llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
        if (token_count_merged_it == part_merged_it->second.end()) {
            part_merged_it->second.emplace(token, count);
            continue;
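The acceptance rule in try_draft above combines an absolute sample-size floor with a relative share of the summed continuation counts. The following standalone sketch (counts are made up) replays that integer check using the strict threshold values shown above; it is an illustration, not code from this diff.

// threshold-sketch.cpp - illustrative only; mirrors the rejection checks in try_draft above
#include <cstdio>

int main() {
    // hypothetical counts for one n-gram: the best continuation was seen 9 times out of 12
    const int max_count = 9;
    const int sum_count = 12;

    const int min_sample_size = 4;   // e.g. the first entry of draft_min_sample_size_strict above
    const int min_percent     = 75;  // e.g. the first entry of draft_min_percent_strict above

    // same integer arithmetic as the source: reject when sum is too small or 100*max < percent*sum
    const bool enough_samples = sum_count >= min_sample_size;                 // 12 >= 4  -> true
    const bool dominant       = 100*max_count >= min_percent*sum_count;       // 900 >= 900 -> true (rejection uses strict <)

    printf("draft the token? %s\n", (enough_samples && dominant) ? "yes" : "no");
    return 0;
}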
||||||
|
|
|
@@ -12,22 +12,22 @@

 // Data structures to map n-grams to empirical token probabilities:

-struct common_ngram {
+struct llama_ngram {
     llama_token tokens[LLAMA_NGRAM_MAX];

-    common_ngram() {
+    llama_ngram() {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = LLAMA_TOKEN_NULL;
+            tokens[i] = -1;
         }
     }

-    common_ngram(const llama_token * input, const int ngram_size) {
+    llama_ngram(const llama_token * input, const int ngram_size) {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
+            tokens[i] = i < ngram_size ? input[i] : -1;
         }
     }

-    bool operator==(const common_ngram & other) const {
+    bool operator==(const llama_ngram & other) const {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             if (tokens[i] != other.tokens[i]) {
                 return false;
@@ -37,28 +37,28 @@ struct common_ngram {
     }
 };

-struct common_token_hash_function {
+struct llama_token_hash_function {
     size_t operator()(const llama_token token) const {
         // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
         return token * 11400714819323198485llu;
     }
 };

-struct common_ngram_hash_function {
+struct llama_ngram_hash_function {
-    size_t operator()(const common_ngram & ngram) const {
+    size_t operator()(const llama_ngram & ngram) const {
-        size_t hash = common_token_hash_function{}(ngram.tokens[0]);
+        size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
         for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
-            hash ^= common_token_hash_function{}(ngram.tokens[i]);
+            hash ^= llama_token_hash_function{}(ngram.tokens[i]);
         }
         return hash;
     }
 };

 // token -> number of times token has been seen
-typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
+typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;

 // n-gram -> empirical distribution of following tokens
-typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
+typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;


 // Update an ngram cache with tokens.
@@ -70,8 +70,8 @@ typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_h
 //
 // In order to get correct results inp_data can ONLY BE APPENDED TO.
 // Changes in the middle need a complete rebuild.
-void common_ngram_cache_update(
+void llama_ngram_cache_update(
-    common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
+    llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);

 // Try to draft tokens from ngram caches.
 // inp: the tokens generated so far.
@@ -81,21 +81,21 @@ void common_ngram_cache_update(
 // nc_context: ngram cache based on current context.
 // nc_dynamic: ngram cache based on previous user generations.
 // nc_static: ngram cache generated from a large text corpus, used for validation.
-void common_ngram_cache_draft(
+void llama_ngram_cache_draft(
     std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
+    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);

 // Save an ngram cache to a file.
 // ngram_cache: the ngram cache to save.
 // filename: the path under which to save the ngram cache.
-void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
+void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);

-// Load an ngram cache saved with common_ngram_cache_save.
+// Load an ngram cache saved with llama_ngram_cache_save.
 // filename: the path from which to load the ngram cache.
 // returns: an ngram cache containing the information saved to filename.
-common_ngram_cache common_ngram_cache_load(std::string & filename);
+llama_ngram_cache llama_ngram_cache_load(std::string & filename);

 // Merge two ngram caches.
 // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
 // ngram_cache_add: the ngram cache to add to ngram_cache_target.
-void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
+void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);

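To make the relationship between the two typedefs and the hash functor concrete, here is a small self-contained sketch. All names are local to the example (not the header's), and LLAMA_NGRAM_MAX is assumed to be 4 for the illustration; it builds a miniature cache and drafts the most frequent follow-up token for one n-gram:

#include <array>
#include <cstdint>
#include <cstdio>
#include <unordered_map>

using token_t = int32_t;
constexpr int NGRAM_MAX = 4;

struct ngram_key {
    std::array<token_t, NGRAM_MAX> tokens{};
    bool operator==(const ngram_key & other) const { return tokens == other.tokens; }
};

struct ngram_hash {
    size_t operator()(const ngram_key & ng) const {
        // Fibonacci hashing per position, XOR-combined, as in the functor above
        size_t h = (size_t) ng.tokens[0] * 11400714819323198485llu;
        for (int i = 1; i < NGRAM_MAX; ++i) {
            h ^= (size_t) ng.tokens[i] * 11400714819323198485llu;
        }
        return h;
    }
};

using cache_part  = std::unordered_map<token_t, int32_t>;                    // token -> count
using ngram_cache = std::unordered_map<ngram_key, cache_part, ngram_hash>;   // n-gram -> counts

int main() {
    ngram_cache cache;
    ngram_key key{{1, 2, 3, -1}};          // a 3-gram, padded with a null token
    cache[key][42] += 3;                   // token 42 followed this 3-gram three times
    cache[key][7]  += 1;                   // token 7 followed it once

    // Draft the most frequently observed follow-up token for this n-gram.
    token_t best = -1; int32_t best_count = 0;
    for (const auto & [tok, cnt] : cache[key]) {
        if (cnt > best_count) { best = tok; best_count = cnt; }
    }
    std::printf("drafted token %d (seen %d times)\n", best, best_count);
    return 0;
}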
@ -1,122 +1,395 @@
|
||||||
|
#define LLAMA_API_INTERNAL
|
||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
|
#include <random>
|
||||||
|
|
||||||
#include "common.h"
|
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
|
||||||
|
struct llama_sampling_context * result = new llama_sampling_context();
|
||||||
|
|
||||||
#include <cmath>
|
result->params = params;
|
||||||
#include <unordered_map>
|
result->grammar = nullptr;
|
||||||
|
|
||||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
// if there is a grammar, parse it
|
||||||
// TODO: deduplicate with llama-impl.h
|
if (!params.grammar.empty()) {
|
||||||
template<typename T>
|
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||||
struct ring_buffer {
|
|
||||||
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
|
|
||||||
|
|
||||||
T & front() {
|
// will be empty (default) if there are parse errors
|
||||||
if (sz == 0) {
|
if (result->parsed_grammar.rules.empty()) {
|
||||||
throw std::runtime_error("ring buffer is empty");
|
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
|
||||||
}
|
delete result;
|
||||||
return data[first];
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
const T & front() const {
|
// Ensure that there is a "root" node.
|
||||||
if (sz == 0) {
|
if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
|
||||||
throw std::runtime_error("ring buffer is empty");
|
fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
|
||||||
}
|
delete result;
|
||||||
return data[first];
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
T & back() {
|
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
|
||||||
if (sz == 0) {
|
|
||||||
throw std::runtime_error("ring buffer is empty");
|
struct llama_grammar * grammar = llama_grammar_init(
|
||||||
|
grammar_rules.data(),
|
||||||
|
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
|
||||||
|
if (grammar == nullptr) {
|
||||||
|
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||||
}
|
}
|
||||||
return data[pos];
|
result->grammar = grammar;
|
||||||
}
|
}
|
||||||
|
|
||||||
const T & back() const {
|
result->prev.resize(params.n_prev);
|
||||||
if (sz == 0) {
|
|
||||||
throw std::runtime_error("ring buffer is empty");
|
|
||||||
}
|
|
||||||
return data[pos];
|
|
||||||
}
|
|
||||||
|
|
||||||
void push_back(const T & value) {
|
result->n_valid = 0;
|
||||||
if (sz == capacity) {
|
|
||||||
// advance the start when buffer is full
|
|
||||||
first = (first + 1) % capacity;
|
|
||||||
} else {
|
|
||||||
sz++;
|
|
||||||
}
|
|
||||||
data[pos] = value;
|
|
||||||
pos = (pos + 1) % capacity;
|
|
||||||
}
|
|
||||||
|
|
||||||
T pop_front() {
|
llama_sampling_set_rng_seed(result, params.seed);
|
||||||
if (sz == 0) {
|
|
||||||
throw std::runtime_error("ring buffer is empty");
|
|
||||||
}
|
|
||||||
T value = data[first];
|
|
||||||
first = (first + 1) % capacity;
|
|
||||||
sz--;
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
const T & rat(size_t i) const {
|
|
||||||
if (i >= sz) {
|
|
||||||
throw std::runtime_error("ring buffer: index out of bounds");
|
|
||||||
}
|
|
||||||
return data[(first + sz - i - 1) % capacity];
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<T> to_vector() const {
|
|
||||||
std::vector<T> result;
|
|
||||||
result.reserve(sz);
|
|
||||||
for (size_t i = 0; i < sz; i++) {
|
|
||||||
result.push_back(data[(first + i) % capacity]);
|
|
||||||
}
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void clear() {
|
void llama_sampling_free(struct llama_sampling_context * ctx) {
|
||||||
// here only reset the status of the buffer
|
if (ctx->grammar != NULL) {
|
||||||
sz = 0;
|
llama_grammar_free(ctx->grammar);
|
||||||
first = 0;
|
|
||||||
pos = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool empty() const {
|
delete ctx;
|
||||||
return sz == 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t size() const {
|
void llama_sampling_reset(llama_sampling_context * ctx) {
|
||||||
return sz;
|
if (ctx->grammar != NULL) {
|
||||||
|
llama_grammar_free(ctx->grammar);
|
||||||
|
ctx->grammar = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t capacity = 0;
|
if (!ctx->parsed_grammar.rules.empty()) {
|
||||||
size_t sz = 0;
|
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
|
||||||
size_t first = 0;
|
|
||||||
size_t pos = 0;
|
struct llama_grammar * grammar = llama_grammar_init(
|
||||||
std::vector<T> data;
|
grammar_rules.data(),
|
||||||
|
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
|
||||||
|
if (grammar == nullptr) {
|
||||||
|
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||||
|
}
|
||||||
|
ctx->grammar = grammar;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
|
||||||
|
ctx->cur.clear();
|
||||||
|
ctx->n_valid = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
|
||||||
|
if (seed == LLAMA_DEFAULT_SEED) {
|
||||||
|
seed = std::random_device{}();
|
||||||
|
}
|
||||||
|
ctx->rng.seed(seed);
|
||||||
|
}
|
||||||
|
|
||||||
|
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
|
||||||
|
if (dst->grammar) {
|
||||||
|
llama_grammar_free(dst->grammar);
|
||||||
|
dst->grammar = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (src->grammar) {
|
||||||
|
dst->grammar = llama_grammar_copy(src->grammar);
|
||||||
|
}
|
||||||
|
|
||||||
|
dst->prev = src->prev;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token llama_sampling_last(llama_sampling_context * ctx) {
|
||||||
|
return ctx->prev.back();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
|
||||||
|
const int size = ctx_sampling->prev.size();
|
||||||
|
|
||||||
|
n = std::min(n, size);
|
||||||
|
|
||||||
|
std::string result;
|
||||||
|
|
||||||
|
for (int i = size - n; i < size; i++) {
|
||||||
|
result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string llama_sampling_print(const llama_sampling_params & params) {
|
||||||
|
char result[1024];
|
||||||
|
|
||||||
|
snprintf(result, sizeof(result),
|
||||||
|
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
||||||
|
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
|
||||||
|
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
||||||
|
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
|
||||||
|
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
|
||||||
|
params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
||||||
|
|
||||||
|
return std::string(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
||||||
|
std::string result = "CFG -> Penalties ";
|
||||||
|
if (params.mirostat == 0) {
|
||||||
|
for (auto sampler_type : params.samplers_sequence) {
|
||||||
|
const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
|
||||||
|
if (!sampler_type_name.empty()) {
|
||||||
|
result += "-> " + sampler_type_name + " ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
result += "-> mirostat ";
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
|
||||||
|
switch (sampler_type) {
|
||||||
|
case llama_sampler_type::TOP_K: return "top_k";
|
||||||
|
case llama_sampler_type::TFS_Z: return "tfs_z";
|
||||||
|
case llama_sampler_type::TYPICAL_P: return "typical_p";
|
||||||
|
case llama_sampler_type::TOP_P: return "top_p";
|
||||||
|
case llama_sampler_type::MIN_P: return "min_p";
|
||||||
|
case llama_sampler_type::TEMPERATURE: return "temperature";
|
||||||
|
default : return "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
||||||
|
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
|
||||||
|
{"top_k", llama_sampler_type::TOP_K},
|
||||||
|
{"top_p", llama_sampler_type::TOP_P},
|
||||||
|
{"typical_p", llama_sampler_type::TYPICAL_P},
|
||||||
|
{"min_p", llama_sampler_type::MIN_P},
|
||||||
|
{"tfs_z", llama_sampler_type::TFS_Z},
|
||||||
|
{"temperature", llama_sampler_type::TEMPERATURE}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct common_sampler {
|
// since samplers names are written multiple ways
|
||||||
common_params_sampling params;
|
// make it ready for both system names and input names
|
||||||
|
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
|
||||||
|
{"top-k", llama_sampler_type::TOP_K},
|
||||||
|
{"top-p", llama_sampler_type::TOP_P},
|
||||||
|
{"nucleus", llama_sampler_type::TOP_P},
|
||||||
|
{"typical-p", llama_sampler_type::TYPICAL_P},
|
||||||
|
{"typical", llama_sampler_type::TYPICAL_P},
|
||||||
|
{"min-p", llama_sampler_type::MIN_P},
|
||||||
|
{"tfs-z", llama_sampler_type::TFS_Z},
|
||||||
|
{"tfs", llama_sampler_type::TFS_Z},
|
||||||
|
{"temp", llama_sampler_type::TEMPERATURE}
|
||||||
|
};
|
||||||
|
|
||||||
struct llama_sampler * grmr;
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
struct llama_sampler * chain;
|
sampler_types.reserve(names.size());
|
||||||
|
for (const auto & name : names)
|
||||||
|
{
|
||||||
|
auto sampler_item = sampler_canonical_name_map.find(name);
|
||||||
|
if (sampler_item != sampler_canonical_name_map.end())
|
||||||
|
{
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (allow_alt_names)
|
||||||
|
{
|
||||||
|
sampler_item = sampler_alt_name_map.find(name);
|
||||||
|
if (sampler_item != sampler_alt_name_map.end())
|
||||||
|
{
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sampler_types;
|
||||||
|
}
|
||||||
|
|
||||||
ring_buffer<llama_token> prev;
|
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
|
||||||
|
std::unordered_map<char, llama_sampler_type> sampler_name_map {
|
||||||
|
{'k', llama_sampler_type::TOP_K},
|
||||||
|
{'p', llama_sampler_type::TOP_P},
|
||||||
|
{'y', llama_sampler_type::TYPICAL_P},
|
||||||
|
{'m', llama_sampler_type::MIN_P},
|
||||||
|
{'f', llama_sampler_type::TFS_Z},
|
||||||
|
{'t', llama_sampler_type::TEMPERATURE}
|
||||||
|
};
|
||||||
|
|
||||||
std::vector<llama_token_data> cur;
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
|
sampler_types.reserve(names_string.size());
|
||||||
|
for (const auto & c : names_string) {
|
||||||
|
const auto sampler_item = sampler_name_map.find(c);
|
||||||
|
if (sampler_item != sampler_name_map.end()) {
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sampler_types;
|
||||||
|
}
|
||||||
|
|
||||||
llama_token_data_array cur_p;
|
// no reasons to expose this function in header
|
||||||
|
static void sampler_queue(
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
const llama_sampling_params & params,
|
||||||
|
llama_token_data_array & cur_p,
|
||||||
|
size_t min_keep) {
|
||||||
|
const float temp = params.temp;
|
||||||
|
const float dynatemp_range = params.dynatemp_range;
|
||||||
|
const float dynatemp_exponent = params.dynatemp_exponent;
|
||||||
|
const int32_t top_k = params.top_k;
|
||||||
|
const float top_p = params.top_p;
|
||||||
|
const float min_p = params.min_p;
|
||||||
|
const float tfs_z = params.tfs_z;
|
||||||
|
const float typical_p = params.typical_p;
|
||||||
|
const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
|
||||||
|
|
||||||
void set_logits(struct llama_context * ctx, int idx) {
|
for (auto sampler_type : samplers_sequence) {
|
||||||
const auto * logits = llama_get_logits_ith(ctx, idx);
|
switch (sampler_type) {
|
||||||
|
case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
|
||||||
|
case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
|
||||||
|
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||||
|
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||||
|
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||||
|
case llama_sampler_type::TEMPERATURE:
|
||||||
|
if (dynatemp_range > 0) {
|
||||||
|
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
||||||
|
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
||||||
|
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
|
||||||
|
} else {
|
||||||
|
llama_sample_temp(ctx_main, &cur_p, temp);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default : break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const llama_model * model = llama_get_model(ctx);
|
static llama_token llama_sampling_sample_impl(
|
||||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
struct llama_sampling_context * ctx_sampling,
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
struct llama_context * ctx_cfg,
|
||||||
|
const int idx,
|
||||||
|
bool is_resampling) {
|
||||||
|
const llama_sampling_params & params = ctx_sampling->params;
|
||||||
|
|
||||||
const int n_vocab = llama_vocab_n_tokens(vocab);
|
const float temp = params.temp;
|
||||||
|
const int mirostat = params.mirostat;
|
||||||
|
const float mirostat_tau = params.mirostat_tau;
|
||||||
|
const float mirostat_eta = params.mirostat_eta;
|
||||||
|
|
||||||
|
std::vector<float> original_logits;
|
||||||
|
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
|
||||||
|
if (ctx_sampling->grammar != NULL && !is_resampling) {
|
||||||
|
GGML_ASSERT(!original_logits.empty());
|
||||||
|
}
|
||||||
|
llama_token id = 0;
|
||||||
|
|
||||||
|
if (temp < 0.0) {
|
||||||
|
// greedy sampling, with probs
|
||||||
|
llama_sample_softmax(ctx_main, &cur_p);
|
||||||
|
id = cur_p.data[0].id;
|
||||||
|
} else if (temp == 0.0) {
|
||||||
|
// greedy sampling, no probs
|
||||||
|
id = llama_sample_token_greedy(ctx_main, &cur_p);
|
||||||
|
} else {
|
||||||
|
if (mirostat == 1) {
|
||||||
|
const int mirostat_m = 100;
|
||||||
|
llama_sample_temp(ctx_main, &cur_p, temp);
|
||||||
|
id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
|
||||||
|
} else if (mirostat == 2) {
|
||||||
|
llama_sample_temp(ctx_main, &cur_p, temp);
|
||||||
|
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
|
||||||
|
} else {
|
||||||
|
// temperature sampling
|
||||||
|
size_t min_keep = std::max(1, params.min_keep);
|
||||||
|
|
||||||
|
sampler_queue(ctx_main, params, cur_p, min_keep);
|
||||||
|
|
||||||
|
id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
|
||||||
|
|
||||||
|
//{
|
||||||
|
// const int n_top = 10;
|
||||||
|
// LOG("top %d candidates:\n", n_top);
|
||||||
|
|
||||||
|
// for (int i = 0; i < n_top; i++) {
|
||||||
|
// const llama_token id = cur_p.data[i].id;
|
||||||
|
// (void)id; // To avoid a warning that id is unused when logging is disabled.
|
||||||
|
// LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
|
||||||
|
// }
|
||||||
|
//}
|
||||||
|
|
||||||
|
//LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctx_sampling->grammar != NULL && !is_resampling) {
|
||||||
|
// Get a pointer to the logits
|
||||||
|
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||||
|
|
||||||
|
// Create an array with a single token data element for the sampled id
|
||||||
|
llama_token_data single_token_data = {id, logits[id], 0.0f};
|
||||||
|
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
|
||||||
|
|
||||||
|
// Apply grammar constraints to the single token
|
||||||
|
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
|
||||||
|
|
||||||
|
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
|
||||||
|
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
|
||||||
|
|
||||||
|
// If the token is not valid according to the grammar, perform resampling
|
||||||
|
if (!is_valid) {
|
||||||
|
LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||||
|
|
||||||
|
// Restore logits from the copy
|
||||||
|
std::copy(original_logits.begin(), original_logits.end(), logits);
|
||||||
|
|
||||||
|
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
|
||||||
|
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
static llama_token_data_array llama_sampling_prepare_impl(
|
||||||
|
struct llama_sampling_context * ctx_sampling,
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
struct llama_context * ctx_cfg,
|
||||||
|
const int idx,
|
||||||
|
bool apply_grammar,
|
||||||
|
std::vector<float> * original_logits) {
|
||||||
|
const llama_sampling_params & params = ctx_sampling->params;
|
||||||
|
|
||||||
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||||
|
|
||||||
|
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
|
||||||
|
const float penalty_repeat = params.penalty_repeat;
|
||||||
|
const float penalty_freq = params.penalty_freq;
|
||||||
|
const float penalty_present = params.penalty_present;
|
||||||
|
|
||||||
|
const bool penalize_nl = params.penalize_nl;
|
||||||
|
|
||||||
|
auto & prev = ctx_sampling->prev;
|
||||||
|
auto & cur = ctx_sampling->cur;
|
||||||
|
|
||||||
|
// Get a pointer to the logits
|
||||||
|
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||||
|
|
||||||
|
if (ctx_sampling->grammar != NULL && !apply_grammar) {
|
||||||
|
GGML_ASSERT(original_logits != NULL);
|
||||||
|
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
|
||||||
|
*original_logits = {logits, logits + n_vocab};
|
||||||
|
}
|
||||||
|
|
||||||
|
// apply params.logit_bias map
|
||||||
|
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||||
|
logits[it->first] += it->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctx_cfg) {
|
||||||
|
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
|
||||||
|
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
|
||||||
|
}
|
||||||
|
|
||||||
cur.resize(n_vocab);
|
cur.resize(n_vocab);
|
||||||
|
|
||||||
|
@ -124,403 +397,64 @@ struct common_sampler {
|
||||||
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
|
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
|
||||||
}
|
}
|
||||||
|
|
||||||
cur_p = { cur.data(), cur.size(), -1, false };
|
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
|
||||||
}
|
|
||||||
};
|
|
||||||
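On the left-hand (master) side of this file, the common_sampler struct just closed above keeps its token history in the fixed-capacity ring_buffer defined earlier. A short usage fragment — not part of the diff, and assuming the ring_buffer template and the llama_token typedef above are in scope — showing how the newest and oldest retained tokens are addressed:

// Fragment only; assumes ring_buffer<T> as defined above and llama_token = int32_t.
ring_buffer<llama_token> prev(4);               // keep at most the 4 most recent tokens
for (llama_token t : {10, 11, 12, 13, 14}) {
    prev.push_back(t);                          // once full, the oldest entry (10) is overwritten
}
llama_token last   = prev.rat(0);               // 14: rat(0) is the most recently pushed token
llama_token oldest = prev.rat(prev.size() - 1); // 11: the oldest token still retained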
|
|
||||||
std::string common_params_sampling::print() const {
|
// apply penalties
|
||||||
char result[1024];
|
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
|
||||||
|
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
|
||||||
|
if (penalty_tokens_used_size) {
|
||||||
|
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
|
||||||
|
|
||||||
snprintf(result, sizeof(result),
|
llama_sample_repetition_penalties(ctx_main, &cur_p,
|
||||||
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
|
||||||
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
|
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
|
||||||
"\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
|
|
||||||
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
|
||||||
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
|
|
||||||
dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
|
|
||||||
top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
|
|
||||||
mirostat, mirostat_eta, mirostat_tau);
|
|
||||||
|
|
||||||
return std::string(result);
|
if (!penalize_nl) {
|
||||||
}
|
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
||||||
|
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
|
||||||
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
|
cur_p.data[idx].logit = nl_logit;
|
||||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
||||||
|
|
||||||
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
|
||||||
|
|
||||||
lparams.no_perf = params.no_perf;
|
|
||||||
|
|
||||||
std::vector<const char *> trigger_words;
|
|
||||||
trigger_words.reserve(params.grammar_trigger_words.size());
|
|
||||||
for (const auto & str : params.grammar_trigger_words) {
|
|
||||||
trigger_words.push_back(str.word.c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
struct llama_sampler * grmr;
|
|
||||||
if (params.grammar.compare(0, 11, "%llguidance") == 0) {
|
|
||||||
#ifdef LLAMA_USE_LLGUIDANCE
|
|
||||||
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
|
|
||||||
#else
|
|
||||||
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
|
|
||||||
#endif // LLAMA_USE_LLGUIDANCE
|
|
||||||
} else {
|
|
||||||
grmr = params.grammar_lazy
|
|
||||||
? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
|
|
||||||
trigger_words.data(), trigger_words.size(),
|
|
||||||
params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
|
|
||||||
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
|
|
||||||
}
|
|
||||||
|
|
||||||
auto * result = new common_sampler {
|
|
||||||
/* .params = */ params,
|
|
||||||
/* .grmr = */ grmr,
|
|
||||||
/* .chain = */ llama_sampler_chain_init(lparams),
|
|
||||||
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
|
|
||||||
/* .cur = */ {},
|
|
||||||
/* .cur_p = */ {},
|
|
||||||
};
|
|
||||||
|
|
||||||
llama_sampler_chain_add(result->chain,
|
|
||||||
llama_sampler_init_logit_bias(
|
|
||||||
llama_vocab_n_tokens(vocab),
|
|
||||||
params.logit_bias.size(),
|
|
||||||
params.logit_bias.data()));
|
|
||||||
|
|
||||||
if (params.mirostat == 0) {
|
|
||||||
for (const auto & cnstr : params.samplers) {
|
|
||||||
switch (cnstr) {
|
|
||||||
case COMMON_SAMPLER_TYPE_DRY:
|
|
||||||
{
|
|
||||||
std::vector<const char *> c_breakers;
|
|
||||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
|
||||||
for (const auto & str : params.dry_sequence_breakers) {
|
|
||||||
c_breakers.push_back(str.c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
|
||||||
break;
|
|
||||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
|
||||||
break;
|
|
||||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
|
||||||
break;
|
|
||||||
case COMMON_SAMPLER_TYPE_XTC:
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
|
||||||
break;
|
|
||||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
|
||||||
break;
|
|
||||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
|
||||||
break;
|
|
||||||
case COMMON_SAMPLER_TYPE_INFILL:
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
|
||||||
break;
|
|
||||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
GGML_ASSERT(false && "unknown sampler type");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
|
||||||
} else if (params.mirostat == 1) {
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
|
|
||||||
} else if (params.mirostat == 2) {
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
|
|
||||||
} else {
|
|
||||||
GGML_ASSERT(false && "unknown mirostat version");
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
void common_sampler_free(struct common_sampler * gsmpl) {
|
|
||||||
if (gsmpl) {
|
|
||||||
llama_sampler_free(gsmpl->grmr);
|
|
||||||
|
|
||||||
llama_sampler_free(gsmpl->chain);
|
|
||||||
|
|
||||||
delete gsmpl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
|
||||||
if (accept_grammar) {
|
|
||||||
llama_sampler_accept(gsmpl->grmr, token);
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_sampler_accept(gsmpl->chain, token);
|
|
||||||
|
|
||||||
gsmpl->prev.push_back(token);
|
|
||||||
}
|
|
||||||
|
|
||||||
void common_sampler_reset(struct common_sampler * gsmpl) {
|
|
||||||
llama_sampler_reset(gsmpl->grmr);
|
|
||||||
|
|
||||||
llama_sampler_reset(gsmpl->chain);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
|
||||||
return new common_sampler {
|
|
||||||
/* .params = */ gsmpl->params,
|
|
||||||
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
|
|
||||||
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
|
||||||
/* .prev = */ gsmpl->prev,
|
|
||||||
/* .cur = */ gsmpl->cur,
|
|
||||||
/* .cur_p = */ gsmpl->cur_p,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
|
|
||||||
// TODO: measure grammar performance
|
|
||||||
|
|
||||||
if (gsmpl) {
|
|
||||||
llama_perf_sampler_print(gsmpl->chain);
|
|
||||||
}
|
|
||||||
if (ctx) {
|
|
||||||
llama_perf_context_print(ctx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
|
|
||||||
gsmpl->set_logits(ctx, idx);
|
|
||||||
|
|
||||||
auto & grmr = gsmpl->grmr;
|
|
||||||
auto & chain = gsmpl->chain;
|
|
||||||
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
|
|
||||||
|
|
||||||
if (grammar_first) {
|
|
||||||
llama_sampler_apply(grmr, &cur_p);
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_sampler_apply(chain, &cur_p);
|
|
||||||
|
|
||||||
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
|
|
||||||
|
|
||||||
const llama_token id = cur_p.data[cur_p.selected].id;
|
|
||||||
|
|
||||||
if (grammar_first) {
|
|
||||||
return id;
|
|
||||||
}
|
|
||||||
|
|
||||||
// check if it the sampled token fits the grammar
|
|
||||||
{
|
|
||||||
llama_token_data single_token_data = { id, 1.0f, 0.0f };
|
|
||||||
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
|
|
||||||
|
|
||||||
llama_sampler_apply(grmr, &single_token_data_array);
|
|
||||||
|
|
||||||
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
|
|
||||||
if (is_valid) {
|
|
||||||
return id;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// resampling:
|
|
||||||
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
|
|
||||||
gsmpl->set_logits(ctx, idx);
|
|
||||||
|
|
||||||
llama_sampler_apply(grmr, &cur_p);
|
|
||||||
llama_sampler_apply(chain, &cur_p);
|
|
||||||
|
|
||||||
GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
|
|
||||||
|
|
||||||
return cur_p.data[cur_p.selected].id;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
|
|
||||||
GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
|
|
||||||
|
|
||||||
std::vector<llama_token> result;
|
|
||||||
result.reserve(idxs.size());
|
|
||||||
|
|
||||||
size_t i = 0;
|
|
||||||
for (; i < draft.size(); i++) {
|
|
||||||
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
|
|
||||||
|
|
||||||
common_sampler_accept(gsmpl, id, true);
|
|
||||||
|
|
||||||
result.push_back(id);
|
|
||||||
|
|
||||||
if (draft[i] != id) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i == draft.size()) {
|
|
||||||
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
|
|
||||||
|
|
||||||
common_sampler_accept(gsmpl, id, true);
|
|
||||||
|
|
||||||
result.push_back(id);
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
|
|
||||||
std::vector<int> idxs(draft.size() + 1);
|
|
||||||
for (size_t i = 0; i < idxs.size(); ++i) {
|
|
||||||
idxs[i] = i;
|
|
||||||
}
|
|
||||||
|
|
||||||
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
|
||||||
return llama_sampler_get_seed(gsmpl->chain);
|
|
||||||
}
|
|
||||||
|
|
||||||
// helpers
|
|
||||||
|
|
||||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
|
|
||||||
return &gsmpl->cur_p;
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token common_sampler_last(const struct common_sampler * gsmpl) {
|
|
||||||
return gsmpl->prev.rat(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string common_sampler_print(const struct common_sampler * gsmpl) {
|
|
||||||
std::string result = "logits ";
|
|
||||||
|
|
||||||
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
|
|
||||||
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
|
|
||||||
result += std::string("-> ") + llama_sampler_name(smpl) + " ";
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
|
|
||||||
n = std::min(n, (int) gsmpl->prev.size());
|
|
||||||
|
|
||||||
if (n <= 0) {
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string result;
|
|
||||||
result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
|
|
||||||
|
|
||||||
for (int i = n - 1; i >= 0; i--) {
|
|
||||||
const llama_token id = gsmpl->prev.rat(i);
|
|
||||||
|
|
||||||
GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
|
|
||||||
|
|
||||||
result += common_token_to_piece(ctx_main, id);
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
|
||||||
switch (cnstr) {
|
|
||||||
case COMMON_SAMPLER_TYPE_DRY: return 'd';
|
|
||||||
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
|
|
||||||
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
|
||||||
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
|
||||||
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
|
||||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
|
||||||
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
|
||||||
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
|
|
||||||
case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
|
|
||||||
default : return '?';
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
// apply grammar checks before sampling logic
|
||||||
switch (cnstr) {
|
if (apply_grammar && ctx_sampling->grammar != NULL) {
|
||||||
case COMMON_SAMPLER_TYPE_DRY: return "dry";
|
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
|
||||||
case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
|
|
||||||
case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
|
|
||||||
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
|
|
||||||
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
|
|
||||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
|
||||||
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
|
|
||||||
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
|
|
||||||
case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
|
|
||||||
default : return "";
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
return cur_p;
|
||||||
std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
|
|
||||||
{ "dry", COMMON_SAMPLER_TYPE_DRY },
|
|
||||||
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
|
|
||||||
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
|
|
||||||
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
|
||||||
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
|
||||||
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
|
||||||
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
|
|
||||||
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
|
|
||||||
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
|
|
||||||
};
|
|
||||||
|
|
||||||
// since samplers names are written multiple ways
|
|
||||||
// make it ready for both system names and input names
|
|
||||||
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
|
|
||||||
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
|
|
||||||
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
|
|
||||||
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
|
|
||||||
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
|
||||||
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
|
||||||
{ "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
|
||||||
{ "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
|
||||||
{ "min-p", COMMON_SAMPLER_TYPE_MIN_P },
|
|
||||||
{ "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<common_sampler_type> samplers;
|
|
||||||
samplers.reserve(names.size());
|
|
||||||
|
|
||||||
for (const auto & name : names) {
|
|
||||||
auto sampler = sampler_canonical_name_map.find(name);
|
|
||||||
if (sampler != sampler_canonical_name_map.end()) {
|
|
||||||
samplers.push_back(sampler->second);
|
|
||||||
} else {
|
|
||||||
if (allow_alt_names) {
|
|
||||||
sampler = sampler_alt_name_map.find(name);
|
|
||||||
if (sampler != sampler_alt_name_map.end()) {
|
|
||||||
samplers.push_back(sampler->second);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return samplers;
|
llama_token llama_sampling_sample(
|
||||||
|
struct llama_sampling_context * ctx_sampling,
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
struct llama_context * ctx_cfg,
|
||||||
|
const int idx) {
|
||||||
|
// Call the implementation function with is_resampling set to false by default
|
||||||
|
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
|
llama_token_data_array llama_sampling_prepare(
|
||||||
std::unordered_map<char, common_sampler_type> sampler_name_map = {
|
struct llama_sampling_context * ctx_sampling,
|
||||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY), COMMON_SAMPLER_TYPE_DRY },
|
struct llama_context * ctx_main,
|
||||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
|
struct llama_context * ctx_cfg,
|
||||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
|
const int idx,
|
||||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
|
bool apply_grammar,
|
||||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
|
std::vector<float> * original_logits) {
|
||||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
|
return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
|
||||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
|
|
||||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
|
|
||||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES },
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<common_sampler_type> samplers;
|
|
||||||
samplers.reserve(chars.size());
|
|
||||||
|
|
||||||
for (const auto & c : chars) {
|
|
||||||
const auto sampler = sampler_name_map.find(c);
|
|
||||||
if (sampler != sampler_name_map.end()) {
|
|
||||||
samplers.push_back(sampler->second);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return samplers;
|
void llama_sampling_accept(
|
||||||
|
struct llama_sampling_context * ctx_sampling,
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
llama_token id,
|
||||||
|
bool apply_grammar) {
|
||||||
|
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
||||||
|
ctx_sampling->prev.push_back(id);
|
||||||
|
|
||||||
|
if (ctx_sampling->grammar != NULL && apply_grammar) {
|
||||||
|
llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
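Both sides of this file end with a character-to-sampler lookup (common_sampler_types_from_chars on the master side, llama_sampling_types_from_chars on the branch side). As a usage note grounded in the maps shown above, the single-character string "kfypmt" expands to the branch's default sampler order:

// Usage sketch against the branch-side API shown above.
std::vector<llama_sampler_type> seq = llama_sampling_types_from_chars("kfypmt");
// seq == { TOP_K, TFS_Z, TYPICAL_P, TOP_P, MIN_P, TEMPERATURE },
// i.e. exactly the samplers_sequence default declared in the branch's sampling.h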
@ -2,106 +2,159 @@
|
||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include "common.h"
|
#include "grammar-parser.h"
|
||||||
|
|
||||||
|
#include <random>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <unordered_map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
// common_sampler extends llama_sampler with additional functionality:
|
// sampler types
|
||||||
|
enum class llama_sampler_type : char {
|
||||||
|
TOP_K = 'k',
|
||||||
|
TOP_P = 'p',
|
||||||
|
MIN_P = 'm',
|
||||||
|
TFS_Z = 'f',
|
||||||
|
TYPICAL_P = 'y',
|
||||||
|
TEMPERATURE = 't'
|
||||||
|
};
|
||||||
|
|
||||||
|
// sampling parameters
|
||||||
|
typedef struct llama_sampling_params {
|
||||||
|
int32_t n_prev = 64; // number of previous tokens to remember
|
||||||
|
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||||
|
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||||
|
int32_t top_k = 40; // <= 0 to use vocab size
|
||||||
|
float top_p = 0.95f; // 1.0 = disabled
|
||||||
|
float min_p = 0.05f; // 0.0 = disabled
|
||||||
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
|
float typical_p = 1.00f; // 1.0 = disabled
|
||||||
|
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||||
|
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||||
|
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||||
|
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||||
|
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||||
|
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||||
|
float penalty_present = 0.00f; // 0.0 = disabled
|
||||||
|
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||||
|
float mirostat_tau = 5.00f; // target entropy
|
||||||
|
float mirostat_eta = 0.10f; // learning rate
|
||||||
|
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||||
|
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> samplers_sequence = {
|
||||||
|
llama_sampler_type::TOP_K,
|
||||||
|
llama_sampler_type::TFS_Z,
|
||||||
|
llama_sampler_type::TYPICAL_P,
|
||||||
|
llama_sampler_type::TOP_P,
|
||||||
|
llama_sampler_type::MIN_P,
|
||||||
|
llama_sampler_type::TEMPERATURE
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||||
|
|
||||||
|
// Classifier-Free Guidance
|
||||||
|
// https://arxiv.org/abs/2306.17806
|
||||||
|
std::string cfg_negative_prompt; // string to help guidance
|
||||||
|
float cfg_scale = 1.f; // how strong is guidance
|
||||||
|
|
||||||
|
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||||
|
|
||||||
|
std::vector<llama_token> penalty_prompt_tokens;
|
||||||
|
bool use_penalty_prompt_tokens = false;
|
||||||
|
} llama_sampling_params;
|
||||||
|
|
||||||
|
// general sampler context
|
||||||
|
// TODO: move to llama.h
|
||||||
|
struct llama_sampling_context {
|
||||||
|
// parameters that will be used for sampling
|
||||||
|
llama_sampling_params params;
|
||||||
|
|
||||||
|
// mirostat sampler state
|
||||||
|
float mirostat_mu;
|
||||||
|
|
||||||
|
llama_grammar * grammar;
|
||||||
|
|
||||||
|
// internal
|
||||||
|
grammar_parser::parse_state parsed_grammar;
|
||||||
|
|
||||||
|
// TODO: replace with ring-buffer
|
||||||
|
std::vector<llama_token> prev;
|
||||||
|
std::vector<llama_token_data> cur;
|
||||||
|
size_t n_valid; // Number of correct top tokens with correct probabilities.
|
||||||
|
|
||||||
|
std::mt19937 rng;
|
||||||
|
};
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
// Create a new sampling context instance.
|
||||||
|
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
|
||||||
|
|
||||||
|
void llama_sampling_free(struct llama_sampling_context * ctx);
|
||||||
|
|
||||||
|
// Reset the sampler context
|
||||||
|
// - clear prev tokens
|
||||||
|
// - reset grammar
|
||||||
|
void llama_sampling_reset(llama_sampling_context * ctx);
|
||||||
|
|
||||||
|
// Set the sampler seed
|
||||||
|
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
|
||||||
|
|
||||||
|
// Copy the sampler context
|
||||||
|
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
|
||||||
|
|
||||||
|
// Get the last sampled token
|
||||||
|
llama_token llama_sampling_last(llama_sampling_context * ctx);
|
||||||
|
|
||||||
|
// Get a string representation of the last sampled tokens
|
||||||
|
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
|
||||||
|
|
||||||
|
// Print sampling parameters into a string
|
||||||
|
std::string llama_sampling_print(const llama_sampling_params & params);
|
||||||
|
|
||||||
|
// Print sampling order into a string
|
||||||
|
std::string llama_sampling_order_print(const llama_sampling_params & params);
|
||||||
|
|
||||||
|
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
|
||||||
|
|
||||||
|
// this is a common sampling function used across the examples for convenience
|
||||||
|
// it can serve as a starting point for implementing your own sampling function
|
||||||
|
// Note: When using multiple sequences, it is the caller's responsibility to call
|
||||||
|
// llama_sampling_reset when a sequence ends
|
||||||
//
|
//
|
||||||
// - grammar support
|
// required:
|
||||||
// - custom sampler logic based on the parameters
|
// - ctx_main: context to use for sampling
|
||||||
// - history of the last accepted tokens
|
// - ctx_sampling: sampling-specific context
|
||||||
// - performance metrics
|
|
||||||
//
|
//
|
||||||
// This goal is to have a common implementation of the sampling logic shared across the examples.
|
// optional:
|
||||||
// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
|
// - ctx_cfg: context to use for classifier-free guidance
|
||||||
// complex (top-k, top-p, etc).
|
// - idx: sample from llama_get_logits_ith(ctx, idx)
|
||||||
//
|
//
|
||||||
// Another example is related to the grammar. In general, the grammar constraints applied on the full
|
// returns:
|
||||||
// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
|
// - token: sampled token
|
||||||
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
|
// - candidates: vector of candidate tokens
|
||||||
// grammar constraints are applied to the full vocabulary and the token is resampled.
|
|
||||||
//
|
|
||||||
// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
|
|
||||||
// be moved into the core llama library.
|
|
||||||
//
|
|
||||||
// For convenience, the common_sampler also maintains a container with the current candidate tokens.
|
|
||||||
// This can be used to access the probabilities of the rest of the non-sampled tokens.
|
|
||||||
//
|
|
||||||
// TODO: measure grammar performance
|
|
||||||
//
|
//
|
||||||
|
llama_token llama_sampling_sample(
|
||||||
|
struct llama_sampling_context * ctx_sampling,
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
struct llama_context * ctx_cfg,
|
||||||
|
int idx = -1);
|
||||||
|
|
||||||
struct common_sampler;
|
// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
|
||||||
|
llama_token_data_array llama_sampling_prepare(
|
||||||
|
struct llama_sampling_context * ctx_sampling,
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
struct llama_context * ctx_cfg,
|
||||||
|
int idx = 0,
|
||||||
|
bool apply_grammar = true,
|
||||||
|
std::vector<float> * original_logits = nullptr);
|
||||||
|
|
||||||
// llama_sampler API overloads
|
void llama_sampling_accept(
|
||||||
|
struct llama_sampling_context * ctx_sampling,
|
||||||
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
|
struct llama_context * ctx_main,
|
||||||
|
llama_token id,
|
||||||
void common_sampler_free(struct common_sampler * gsmpl);
|
bool apply_grammar);
|
||||||
|
|
||||||
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
|
|
||||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
|
|
||||||
void common_sampler_reset (struct common_sampler * gsmpl);
|
|
||||||
struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
|
|
||||||
|
|
||||||
// arguments can be nullptr to skip printing
|
|
||||||
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
|
|
||||||
|
|
||||||
// extended sampling implementation:
|
|
||||||
//
|
|
||||||
// - set logits
|
|
||||||
// - apply the configured sampler chain
|
|
||||||
// - check if the token fits the grammar (if any)
|
|
||||||
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
|
|
||||||
//
|
|
||||||
// if grammar_first is true, the grammar is applied before the samplers (slower)
|
|
||||||
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
|
|
||||||
//
|
|
||||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
|
|
||||||
|
|
||||||
// generalized version of common_sampler_sample
|
|
||||||
//
|
|
||||||
// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
|
|
||||||
// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
|
|
||||||
//
|
|
||||||
// common_sampler_sample_n(gsmpl, ctx, { idx }, {});
|
|
||||||
//
|
|
||||||
// is equivalent to
|
|
||||||
//
|
|
||||||
// common_sampler_sample(gsmpl, ctx, idx);
|
|
||||||
// common_sampler_accept(gsmpl, token, true);
|
|
||||||
//
|
|
||||||
// requires: idxs.size() == draft.size() + 1
|
|
||||||
//
|
|
||||||
// returns at least 1 token, up to idxs.size()
|
|
||||||
//
|
|
||||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
|
|
||||||
|
|
||||||
// assume idxs == [ 0, 1, 2, ..., draft.size() ]
|
|
||||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
|
|
||||||
|
|
||||||
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
|
|
||||||
|
|
||||||
// helpers
|
|
||||||
|
|
||||||
// access the internal list of current candidate tokens
|
|
||||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
|
|
||||||
|
|
||||||
// get the last accepted token
|
|
||||||
llama_token common_sampler_last(const struct common_sampler * gsmpl);
|
|
||||||
|
|
||||||
// print the sampler chain into a string
|
|
||||||
std::string common_sampler_print(const struct common_sampler * gsmpl);
|
|
||||||
|
|
||||||
// get a string representation of the last accepted tokens
|
|
||||||
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
|
|
||||||
|
|
||||||
char common_sampler_type_to_chr(enum common_sampler_type cnstr);
|
|
||||||
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
|
|
||||||
|
|
||||||
std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
|
||||||
std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
|
|
||||||
|
|
||||||
llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
|
|
||||||
const char * grammar_kind, const char * grammar_data);
|
|
||||||
|
|
|
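The master-side comments above describe the speculative-decoding contract of common_sampler_sample_and_accept_n: sample along the draft, accept while the target sampler agrees, and always return at least one token. A usage fragment — not from the repository; it assumes an initialized common_sampler * gsmpl and llama_context * ctx, and that the draft tokens have already been evaluated in the batch at indices 0..draft.size() — looks like this:

// Fragment only; token ids are hypothetical.
llama_tokens draft = { 345, 1029, 7 };   // tokens proposed by a draft model
std::vector<llama_token> accepted = common_sampler_sample_and_accept_n(gsmpl, ctx, draft);
// accepted.size() is between 1 and draft.size() + 1: sampling stops at the first
// position where the target sampler disagrees with the corresponding draft token.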
@ -1,277 +0,0 @@
|
||||||
#include "speculative.h"
|
|
||||||
|
|
||||||
#include "log.h"
|
|
||||||
#include "common.h"
|
|
||||||
#include "sampling.h"
|
|
||||||
|
|
||||||
#include <cstring>
|
|
||||||
|
|
||||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
|
|
||||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
|
||||||
|
|
||||||
struct common_speculative {
|
|
||||||
struct llama_context * ctx;
|
|
||||||
struct common_sampler * smpl;
|
|
||||||
|
|
||||||
llama_batch batch;
|
|
||||||
llama_tokens prompt;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct common_speculative * common_speculative_init(
|
|
||||||
struct llama_context * ctx_dft) {
|
|
||||||
auto * result = new common_speculative {
|
|
||||||
/* .ctx = */ ctx_dft,
|
|
||||||
/* .smpl = */ nullptr,
|
|
||||||
/* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
|
|
||||||
/* .prompt = */ {},
|
|
||||||
};
|
|
||||||
|
|
||||||
// TODO: optimize or pass from outside?
|
|
||||||
#if 0
|
|
||||||
{
|
|
||||||
common_params_sampling params;
|
|
||||||
params.no_perf = false;
|
|
||||||
|
|
||||||
params.top_k = 40;
|
|
||||||
params.top_p = 0.9;
|
|
||||||
|
|
||||||
params.samplers = {
|
|
||||||
COMMON_SAMPLER_TYPE_TOP_K,
|
|
||||||
COMMON_SAMPLER_TYPE_TOP_P,
|
|
||||||
COMMON_SAMPLER_TYPE_INFILL,
|
|
||||||
};
|
|
||||||
|
|
||||||
result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
{
|
|
||||||
common_params_sampling params;
|
|
||||||
params.no_perf = false;
|
|
||||||
|
|
||||||
params.top_k = 10;
|
|
||||||
|
|
||||||
params.samplers = {
|
|
||||||
COMMON_SAMPLER_TYPE_TOP_K,
|
|
||||||
};
|
|
||||||
|
|
||||||
result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
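
// illustrative usage (sketch): the draft context is created and owned by the caller, while the
// returned object owns its internal sampler and batch and releases them in common_speculative_free
//
//   struct common_speculative * spec = common_speculative_init(ctx_dft);
//
//   // ... generate drafts with common_speculative_gen_draft ...
//
//   common_speculative_free(spec);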

void common_speculative_free(struct common_speculative * spec) {
    if (spec == nullptr) {
        return;
    }

    common_sampler_free(spec->smpl);

    llama_batch_free(spec->batch);

    delete spec;
}

bool common_speculative_are_compatible(
        const struct llama_context * ctx_tgt,
        const struct llama_context * ctx_dft) {
    const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
    const struct llama_model * model_dft = llama_get_model(ctx_dft);

    const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
    const struct llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);

    const enum llama_vocab_type vocab_type_tgt = llama_vocab_type(vocab_tgt);
    LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);

    const enum llama_vocab_type vocab_type_dft = llama_vocab_type(vocab_dft);
    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);

    if (vocab_type_tgt != vocab_type_dft) {
        LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
                "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
        return false;
    }

    if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
        llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
        llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
        LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
        return false;
    }

    {
        const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
        const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);

        const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);

        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
                    "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
                    __func__, n_vocab_tgt, n_vocab_dft, vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
            return false;
        }

        for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
            const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
            const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
                LOG_ERR("%s: draft vocab must match target vocab to use speculation but "
                        "token %d content differs - target '%s', draft '%s'\n", __func__, i,
                        common_token_to_piece(ctx_tgt, i).c_str(),
                        common_token_to_piece(ctx_dft, i).c_str());
                return false;
            }
        }
    }

    return true;
}
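
// illustrative call pattern (sketch): callers are expected to verify compatibility before
// drafting, e.g.
//
//   if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
//       // the two models cannot be combined - fall back to regular decoding
//   }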

llama_tokens common_speculative_gen_draft(
        struct common_speculative * spec,
        struct common_speculative_params params,
        const llama_tokens & prompt_tgt,
        llama_token id_last) {
    auto & batch  = spec->batch;
    auto & ctx    = spec->ctx;
    auto & smpl   = spec->smpl;
    auto & prompt = spec->prompt;

    int reuse_i = 0;
    int reuse_n = 0;

    const int n_ctx = llama_n_ctx(ctx) - params.n_draft;

    const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);

    // reuse as much as possible from the old draft context
    // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
    for (int i = 0; i < (int) prompt.size(); ++i) {
        int cur = 0;
        while (i_start + cur < (int) prompt_tgt.size() &&
               i + cur < (int) prompt.size() &&
               prompt_tgt[i_start + cur] == prompt[i + cur]) {
            cur++;
        }

        if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
            reuse_i = i;
            reuse_n = cur;
        }
    }
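
    // at this point reuse_i / reuse_n describe the longest stretch of the old draft prompt
    // (starting at index reuse_i) that matches the windowed target prompt and that qualifies
    // for reuse (at least n_reuse tokens long, unless the whole target prompt fits in the
    // draft context)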

    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());

    llama_tokens result;
    result.reserve(params.n_draft);

    if (reuse_n == 0) {
        llama_kv_cache_clear(ctx);

        prompt.clear();
    } else {
        // this happens when a previous draft has been discarded (for example, due to being too small), but the
        // target model agreed with it. in this case, we simply pass back the previous results to save compute
        if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
            for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
                result.push_back(prompt[i]);

                if (params.n_draft <= (int) result.size()) {
                    break;
                }
            }

            return result;
        }

        if (reuse_i > 0) {
            llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
            llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);

            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
        }

        if (reuse_n < (int) prompt.size()) {
            llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);

            prompt.erase(prompt.begin() + reuse_n, prompt.end());
        }
    }

    // prepare a batch to evaluate any new tokens in the prompt
    common_batch_clear(batch);

    for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
        //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
        common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);

        prompt.push_back(prompt_tgt[i]);
    }

    // we should rarely end up here during normal decoding
    if (batch.n_tokens > 0) {
        //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());

        llama_decode(ctx, batch);
    }

    const llama_pos n_past = prompt.size();

    LOG_DBG("%s: n_past = %d\n", __func__, n_past);

    common_batch_clear(batch);
    common_batch_add (batch, id_last, n_past, { 0 }, true);

    prompt.push_back(id_last);

    //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());

    llama_decode(ctx, batch);

    common_sampler_reset(smpl);

    // sample n_draft tokens from the draft model
    for (int i = 0; i < params.n_draft; ++i) {
        common_batch_clear(batch);

        common_sampler_sample(smpl, ctx, 0, true);

        const auto * cur_p = common_sampler_get_candidates(smpl);

        for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
            LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
                k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
        }

        // add drafted token for each sequence
        const llama_token id = cur_p->data[0].id;

        // only collect very high-confidence draft tokens
        if (cur_p->data[0].p < params.p_min) {
            break;
        }

        common_sampler_accept(smpl, id, true);

        result.push_back(id);

        if (params.n_draft <= (int) result.size()) {
            break;
        }

        common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

        // evaluate the drafted tokens on the draft model
        llama_decode(ctx, batch);

        prompt.push_back(id);
    }

    return result;
}

@ -1,28 +0,0 @@

#pragma once

#include "llama.h"
#include "common.h"

struct common_speculative;

struct common_speculative_params {
    int n_draft = 16;  // max drafted tokens
    int n_reuse = 256; // min length of the matching prefix required to reuse the previous draft context

    float p_min = 0.9f; // min probability required to accept a token in the draft
};
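
// example configuration (illustrative sketch): a shorter but more permissive draft setup
//
//   common_speculative_params params;
//
//   params.n_draft = 8;     // draft at most 8 tokens per call
//   params.p_min   = 0.75f; // accept less confident draft tokens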

struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);

void common_speculative_free(struct common_speculative * spec);

bool common_speculative_are_compatible(
        const struct llama_context * ctx_tgt,
        const struct llama_context * ctx_dft);

// sample up to n_draft tokens and add them to the batch using the draft model
llama_tokens common_speculative_gen_draft(
        struct common_speculative * spec,
        struct common_speculative_params params,
        const llama_tokens & prompt,
        llama_token id_last);
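
// illustrative caller loop (sketch; the setup of spec, params, smpl, ctx_tgt, prompt_tgt and
// id_last is assumed to be done by the caller):
//
//   llama_tokens draft = common_speculative_gen_draft(spec, params, prompt_tgt, id_last);
//
//   // ... decode id_last followed by the draft on the target context, with logits enabled
//   //     at batch positions 0 .. draft.size() ...
//
//   const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
//
// ids contains the verified tokens - the full draft plus one extra token if everything matched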
2972
common/stb_image.h
File diff suppressed because it is too large
Some files were not shown because too many files have changed in this diff