Compare commits

1 commit: master ... compilade/
Commit: 9a424a3872
1187 changed files with 200879 additions and 308937 deletions

.clang-format (161 deletions)

@@ -1,161 +0,0 @@
---
Language: Cpp
AlignAfterOpenBracket: Align
AlignArrayOfStructures: Left
AlignConsecutiveAssignments: AcrossComments
AlignConsecutiveBitFields: AcrossComments
AlignConsecutiveDeclarations: AcrossComments
AlignConsecutiveMacros: AcrossComments
# AlignConsecutiveShortCaseStatements: AcrossComments
AlignEscapedNewlines: Left # LeftWithLastLine
AlignOperands: Align
AlignTrailingComments:
  Kind: Always
  OverEmptyLines: 1
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: false
# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: true
BinPackParameters: true # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
  AfterCaseLabel: true
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  BeforeLambdaBody: false
  BeforeWhile: false
  IndentBraces: false
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never
BreakBeforeBinaryOperators: None
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: false
# BreakBinaryOperations: Never
BreakConstructorInitializers: AfterColon
# BreakFunctionDefinitionParameters: false
BreakInheritanceList: AfterComma
BreakStringLiterals: true
# BreakTemplateDeclarations: Yes
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
EmptyLineBeforeAccessModifier: Leave
EmptyLineAfterAccessModifier: Never
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
  - Regex: '^<.*\.h>'
    Priority: 1
    SortPriority: 0
  - Regex: '^<.*'
    Priority: 2
    SortPriority: 0
  - Regex: '.*'
    Priority: 3
    SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseBlocks: true
IndentCaseLabels: true
IndentExternBlock: NoIndent
IndentGotoLabels: false
IndentPPDirectives: AfterHash
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertBraces: true # NOTE: may lead to incorrect formatting
InsertNewlineAtEOF: true
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
LambdaBodyIndentation: Signature
LineEnding: LF
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PPIndentWidth: -1
PackConstructorInitializers: CurrentLine
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Middle
QualifierAlignment: Left
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
RawStringFormats:
  - Language: Cpp
    Delimiters:
      - cc
      - CC
      - cpp
      - Cpp
      - CPP
      - 'c++'
      - 'C++'
    CanonicalDelimiter: ''
ReferenceAlignment: Middle
ReflowComments: false # IndentOnly
SeparateDefinitionBlocks: Always
SortIncludes: CaseInsensitive
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: true
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInContainerLiterals: true
SpacesInLineCommentPrefix:
  Minimum: 1
  Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: c++17
TabWidth: 4
UseTab: Never
WhitespaceSensitiveMacros: ['STRINGIZE']
...

@@ -12,15 +12,12 @@ Checks: >
     -readability-implicit-bool-conversion,
     -readability-magic-numbers,
     -readability-uppercase-literal-suffix,
-    -readability-simplify-boolean-expr,
     clang-analyzer-*,
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
     portability-*,
-    -portability-simd-intrinsics,
     misc-*,
     -misc-const-correctness,
     -misc-non-private-member-variables-in-classes,
     -misc-no-recursion,
-    -misc-use-anonymous-namespace,
 FormatStyle: none

@@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
     stage('Running llama.cpp'){
         sh'''#!/bin/bash
             module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
-            qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+            qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
             cat llama_log.txt # Printing results
         '''
     }

@@ -1,92 +0,0 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

ARG TARGETARCH

ARG GGML_CPU_ARM_ARCH=armv8-a

RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "$TARGETARCH" = "amd64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
    elif [ "$TARGETARCH" = "arm64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
    else \
        echo "Unsupported architecture"; \
        exit 1; \
    fi && \
    cmake --build build -j $(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ubuntu:$UBUNTU_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

@@ -1,94 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

WORKDIR /app

COPY . .

RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_CUDA_RUN_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete


ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

.devops/full-cuda.Dockerfile (new file, 34 additions)

@@ -0,0 +1,34 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1

# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

ENTRYPOINT ["/app/.devops/tools.sh"]

45
.devops/full-rocm.Dockerfile
Normal file
45
.devops/full-rocm.Dockerfile
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
|
||||||
|
# This needs to generally match the container host's environment.
|
||||||
|
ARG ROCM_VERSION=5.6
|
||||||
|
|
||||||
|
# Target the CUDA build image
|
||||||
|
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||||
|
|
||||||
|
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||||
|
|
||||||
|
# Unless otherwise specified, we make a fat build.
|
||||||
|
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||||
|
# This is mostly tied to rocBLAS supported archs.
|
||||||
|
ARG ROCM_DOCKER_ARCH=\
|
||||||
|
gfx803 \
|
||||||
|
gfx900 \
|
||||||
|
gfx906 \
|
||||||
|
gfx908 \
|
||||||
|
gfx90a \
|
||||||
|
gfx1010 \
|
||||||
|
gfx1030 \
|
||||||
|
gfx1100 \
|
||||||
|
gfx1101 \
|
||||||
|
gfx1102
|
||||||
|
|
||||||
|
COPY requirements.txt requirements.txt
|
||||||
|
COPY requirements requirements
|
||||||
|
|
||||||
|
RUN pip install --upgrade pip setuptools wheel \
|
||||||
|
&& pip install -r requirements.txt
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Set nvcc architecture
|
||||||
|
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||||
|
# Enable ROCm
|
||||||
|
ENV LLAMA_HIPBLAS=1
|
||||||
|
ENV CC=/opt/rocm/llvm/bin/clang
|
||||||
|
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||||
|
|
||||||
|
RUN make
|
||||||
|
|
||||||
|
ENTRYPOINT ["/app/.devops/tools.sh"]
|
.devops/full.Dockerfile (new file, 22 additions)

@@ -0,0 +1,22 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

RUN make

ENV LC_ALL=C.utf8

ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,91 +0,0 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04

## Build Image

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" \
        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

### Full
FROM base AS full

COPY --from=build /app/lib/ /app
COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete


ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

@@ -1,44 +0,0 @@
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8

FROM ascendai/cann:$ASCEND_VERSION AS build

WORKDIR /app

COPY . .

RUN yum install -y gcc g++ cmake make
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}

# find libascend_hal.so, because the drive hasn`t been mounted.
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH

RUN echo "Building with static libs" && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
    cmake --build build --config Release --target llama-cli

# TODO: use image with NNRT
FROM ascendai/cann:$ASCEND_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli

ENV LC_ALL=C.utf8

ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}

ENTRYPOINT ["/llama-cli" ]

.devops/llama-cpp-clblast.srpm.spec (new file, 84 additions)

@@ -0,0 +1,84 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
#    We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
#    It is up to the user to install the correct vendor-specific support.

Name: llama.cpp-clblast
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: OpenCL Inference of LLaMA model in C/C++
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
Requires: clblast
URL: https://github.com/ggerganov/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0

%description
CPU inference for Meta's Lllama2 models using default options.

%prep
%setup -n llama.cpp-master

%build
make -j LLAMA_CLBLAST=1

%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamaclblast
cp -p server %{buildroot}%{_bindir}/llamaclblastserver
cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple

mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never

[Install]
WantedBy=default.target
EOF

mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF

%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%{_bindir}/llamaclblast
%{_bindir}/llamaclblastserver
%{_bindir}/llamaclblastsimple
/usr/lib/systemd/system/llamaclblast.service
%config /etc/sysconfig/llama

%pre

%post

%preun
%postun

%changelog

@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

@@ -12,7 +12,7 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 #    It is up to the user to install the correct vendor-specific support.

-Name: llama.cpp-cuda
+Name: llama.cpp-cublas
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
@@ -32,16 +32,16 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master

 %build
-make -j GGML_CUDA=1
+make -j LLAMA_CUBLAS=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
+cp -p main %{buildroot}%{_bindir}/llamacppcublas
+cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple

 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
 [Unit]
 Description=Llama.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
+ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -67,10 +67,10 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama-cuda-cli
-%{_bindir}/llama-cuda-server
-%{_bindir}/llama-cuda-simple
-/usr/lib/systemd/system/llamacuda.service
+%{_bindir}/llamacppcublas
+%{_bindir}/llamacppcublasserver
+%{_bindir}/llamacppcublassimple
+/usr/lib/systemd/system/llamacublas.service
 %config /etc/sysconfig/llama

 %pre

@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

@@ -38,9 +38,9 @@ make -j

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
+cp -p main %{buildroot}%{_bindir}/llama
+cp -p server %{buildroot}%{_bindir}/llamaserver
+cp -p simple %{buildroot}%{_bindir}/llamasimple

 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
@@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-server $LLAMA_ARGS
+ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -69,9 +69,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama-cli
-%{_bindir}/llama-server
-%{_bindir}/llama-simple
+%{_bindir}/llama
+%{_bindir}/llamaserver
+%{_bindir}/llamasimple
 /usr/lib/systemd/system/llama.service
 %config /etc/sysconfig/llama

.devops/main-cuda.Dockerfile (new file, 32 additions)

@@ -0,0 +1,32 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

COPY --from=build /app/main /main

ENTRYPOINT [ "/main" ]

.devops/main-intel.Dockerfile (new file, 28 additions)

@@ -0,0 +1,28 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git

WORKDIR /app

COPY . .

RUN mkdir build && \
    cd build && \
    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
        echo "LLAMA_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
    fi && \
    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
    cmake --build . --config Release --target main

FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

COPY --from=build /app/build/bin/main /main

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/main" ]

.devops/main-rocm.Dockerfile (new file, 45 additions)

@@ -0,0 +1,45 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

RUN make

ENTRYPOINT [ "/app/main" ]

.devops/main-vulkan.Dockerfile (new file, 29 additions)

@@ -0,0 +1,29 @@
ARG UBUNTU_VERSION=jammy

FROM ubuntu:$UBUNTU_VERSION as build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk

# Build it
WORKDIR /app
COPY . .
RUN mkdir build && \
    cd build && \
    cmake .. -DLLAMA_VULKAN=1 && \
    cmake --build . --config Release --target main

# Clean up
WORKDIR /
RUN cp /app/build/bin/main /main && \
    rm -rf /app

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/main" ]

.devops/main.Dockerfile (new file, 20 additions)

@@ -0,0 +1,20 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

RUN make

FROM ubuntu:$UBUNTU_VERSION as runtime

COPY --from=build /app/main /main

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/main" ]

@@ -1,108 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_MUSA_DEV_CONTAINER} AS build

# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y \
        build-essential \
        cmake \
        python3 \
        python3-pip \
        git \
        libcurl4-openssl-dev \
        libgomp1

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_MUSA_RUN_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete


ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

@@ -6,10 +6,11 @@
 let
   inherit (config.packages) default;
   binaries = [
-    "llama-cli"
+    "llama"
     "llama-embedding"
     "llama-server"
-    "llama-quantize"
+    "quantize"
+    "train-text-from-scratch"
   ];
   mkApp = name: {
     type = "app";

@@ -1,52 +1,13 @@
-{ inputs, ... }:
-
 {
   perSystem =
-    {
-      config,
-      lib,
-      system,
-      ...
-    }:
+    { config, lib, ... }:
     {
       devShells =
-        let
-          pkgs = import inputs.nixpkgs { inherit system; };
-          stdenv = pkgs.stdenv;
-          scripts = config.packages.python-scripts;
-        in
-        lib.pipe (config.packages) [
-          (lib.concatMapAttrs (
-            name: package: {
-              ${name} = pkgs.mkShell {
-                name = "${name}";
-                inputsFrom = [ package ];
-                shellHook = ''
-                  echo "Entering ${name} devShell"
-                '';
-              };
-              "${name}-extra" =
-                if (name == "python-scripts") then
-                  null
-                else
-                  pkgs.mkShell {
-                    name = "${name}-extra";
-                    inputsFrom = [
-                      package
-                      scripts
-                    ];
-                    # Extra packages that *may* be used by some scripts
-                    packages = [
-                      pkgs.python3Packages.tiktoken
-                    ];
-                    shellHook = ''
-                      echo "Entering ${name} devShell"
-                      addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
-                    '';
-                  };
-            }
-          ))
-          (lib.filterAttrs (name: value: value != null))
-        ];
+        lib.concatMapAttrs
+          (name: package: {
+            ${name} = package.passthru.shell;
+            ${name + "-extra"} = package.passthru.shell-extra;
+          })
+          config.packages;
     };
 }

@@ -26,14 +26,16 @@
       config.cudaSupport = true;
       config.allowUnfreePredicate =
         p:
-        builtins.all (
-          license:
-          license.free
-          || builtins.elem license.shortName [
-            "CUDA EULA"
-            "cuDNN EULA"
-          ]
-        ) (p.meta.licenses or [ p.meta.license ]);
+        builtins.all
+          (
+            license:
+            license.free
+            || builtins.elem license.shortName [
+              "CUDA EULA"
+              "cuDNN EULA"
+            ]
+          )
+          (p.meta.licenses or [ p.meta.license ]);
     };
     # Ensure dependencies use ROCm consistently
     pkgsRocm = import inputs.nixpkgs {

@@ -1,36 +0,0 @@
{
  lib,
  llamaVersion,
  numpy,
  tqdm,
  sentencepiece,
  pyyaml,
  poetry-core,
  buildPythonPackage,
  pytestCheckHook,
}:

buildPythonPackage {
  pname = "gguf";
  version = llamaVersion;
  pyproject = true;
  nativeBuildInputs = [ poetry-core ];
  propagatedBuildInputs = [
    numpy
    tqdm
    sentencepiece
    pyyaml
  ];
  src = lib.cleanSource ../../gguf-py;
  pythonImportsCheck = [
    "numpy"
    "gguf"
  ];
  nativeCheckInputs = [ pytestCheckHook ];
  doCheck = true;
  meta = with lib; {
    description = "Python package for writing binary files in the GGUF format";
    license = licenses.mit;
    maintainers = [ maintainers.ditsuke ];
  };
}

@@ -3,45 +3,40 @@
   glibc,
   config,
   stdenv,
-  runCommand,
+  mkShell,
   cmake,
   ninja,
   pkg-config,
   git,
+  python3,
   mpi,
-  blas,
+  openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
   cudaPackages,
-  autoAddDriverRunpath,
   darwin,
   rocmPackages,
   vulkan-headers,
   vulkan-loader,
-  curl,
-  shaderc,
-  useBlas ?
-    builtins.all (x: !x) [
-      useCuda
-      useMetalKit
-      useRocm
-      useVulkan
-    ]
-    && blas.meta.available,
+  clblast,
+  useBlas ? builtins.all (x: !x) [
+    useCuda
+    useMetalKit
+    useOpenCL
+    useRocm
+    useVulkan
+  ],
   useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
-  # Increases the runtime closure size by ~700M
-  useMpi ? false,
+  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
+  useMpi ? false, # Increases the runtime closure size by ~700M
+  useOpenCL ? false,
   useRocm ? config.rocmSupport,
-  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
-  enableCurl ? true,
   useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

   # It's necessary to consistently use backendStdenv when building with CUDA support,
   # otherwise we get libstdc++ errors downstream.
   effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
-  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
-  precompileMetalShaders ? false,
-}:
+  enableStatic ? effectiveStdenv.hostPlatform.isStatic
+}@inputs:

 let
   inherit (lib)
@@ -49,6 +44,7 @@ let
     cmakeFeature
     optionals
     strings
+    versionOlder
     ;

   stdenv = throw "Use effectiveStdenv instead";
@@ -58,20 +54,38 @@ let
     ++ lib.optionals useCuda [ "CUDA" ]
     ++ lib.optionals useMetalKit [ "MetalKit" ]
     ++ lib.optionals useMpi [ "MPI" ]
+    ++ lib.optionals useOpenCL [ "OpenCL" ]
     ++ lib.optionals useRocm [ "ROCm" ]
     ++ lib.optionals useVulkan [ "Vulkan" ];

   pnameSuffix =
     strings.optionalString (suffices != [ ])
       "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix = strings.optionalString (
-    suffices != [ ]
-  ) ", accelerated with ${strings.concatStringsSep ", " suffices}";
+  descriptionSuffix =
+    strings.optionalString (suffices != [ ])
+      ", accelerated with ${strings.concatStringsSep ", " suffices}";

-  xcrunHost = runCommand "xcrunHost" { } ''
-    mkdir -p $out/bin
-    ln -s /usr/bin/xcrun $out/bin
-  '';
+  # TODO: package the Python in this repository in a Nix-like way.
+  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
+  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
+  # https://peps.python.org/pep-0517/
+  llama-python = python3.withPackages (
+    ps: [
+      ps.numpy
+      ps.sentencepiece
+    ]
+  );
+
+  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
+  llama-python-extra = python3.withPackages (
+    ps: [
+      ps.numpy
+      ps.sentencepiece
+      ps.tiktoken
+      ps.torchWithoutCuda
+      ps.transformers
+    ]
+  );

   # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
   # separately
@@ -85,9 +99,16 @@ let
     ++ optionals useMetalKit [ MetalKit ];

   cudaBuildInputs = with cudaPackages; [
-    cuda_cudart
-    cuda_cccl # <nv/target>
-    libcublas
+    cuda_cccl.dev # <nv/target>
+
+    # A temporary hack for reducing the closure size, remove once cudaPackages
+    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
+    cuda_cudart.dev
+    cuda_cudart.lib
+    cuda_cudart.static
+    libcublas.dev
+    libcublas.lib
+    libcublas.static
   ];

   rocmBuildInputs = with rocmPackages; [
@@ -99,149 +120,178 @@ let
   vulkanBuildInputs = [
     vulkan-headers
     vulkan-loader
-    shaderc
   ];
 in

-effectiveStdenv.mkDerivation (finalAttrs: {
-  pname = "llama-cpp${pnameSuffix}";
-  version = llamaVersion;
+effectiveStdenv.mkDerivation (
+  finalAttrs: {
+    pname = "llama-cpp${pnameSuffix}";
+    version = llamaVersion;

   # Note: none of the files discarded here are visible in the sandbox or
   # affect the output hash. This also means they can be modified without
   # triggering a rebuild.
   src = lib.cleanSourceWith {
     filter =
       name: type:
       let
         noneOf = builtins.all (x: !x);
         baseName = baseNameOf name;
       in
       noneOf [
         (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
         (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
         (lib.hasPrefix "." baseName) # Skip hidden files and directories
         (baseName == "flake.lock")
+        ];
+      src = lib.cleanSource ../../.;
+    };
+
+    postPatch = ''
+      substituteInPlace ./ggml-metal.m \
+        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+
+      # TODO: Package up each Python script or service appropriately.
+      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
+      # we could make those *.py into setuptools' entrypoints
+      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
+    '';
+
+    nativeBuildInputs =
+      [
+        cmake
+        ninja
+        pkg-config
+        git
+      ]
+      ++ optionals useCuda [
+        cudaPackages.cuda_nvcc
+
+        # TODO: Replace with autoAddDriverRunpath
+        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
+        cudaPackages.autoAddOpenGLRunpathHook
+      ]
+      ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
+        glibc.static
       ];
-    src = lib.cleanSource ../../.;
-  };

-  postPatch = ''
-    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
-      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
-      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
-  '';
+    buildInputs =
+      optionals effectiveStdenv.isDarwin darwinBuildInputs
+      ++ optionals useCuda cudaBuildInputs
+      ++ optionals useMpi [ mpi ]
+      ++ optionals useOpenCL [ clblast ]
+      ++ optionals useRocm rocmBuildInputs
+      ++ optionals useVulkan vulkanBuildInputs;

-  # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
-  # `default.metallib` may be compiled with Metal compiler from XCode
-  # and we need to escape sandbox on MacOS to access Metal compiler.
-  # `xcrun` is used find the path of the Metal compiler, which is varible
-  # and not on $PATH
-  # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
-  __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
-  nativeBuildInputs =
-    [
-      cmake
-      ninja
-      pkg-config
-      git
+    cmakeFlags =
+      [
+        (cmakeBool "LLAMA_NATIVE" false)
+        (cmakeBool "LLAMA_BUILD_SERVER" true)
+        (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
+        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+        (cmakeBool "LLAMA_BLAS" useBlas)
+        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
+        (cmakeBool "LLAMA_CUBLAS" useCuda)
+        (cmakeBool "LLAMA_HIPBLAS" useRocm)
+        (cmakeBool "LLAMA_METAL" useMetalKit)
+        (cmakeBool "LLAMA_MPI" useMpi)
+        (cmakeBool "LLAMA_VULKAN" useVulkan)
+        (cmakeBool "LLAMA_STATIC" enableStatic)
     ]
     ++ optionals useCuda [
-      cudaPackages.cuda_nvcc
-      autoAddDriverRunpath
-    ]
-    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
-    ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
-
-  buildInputs =
-    optionals effectiveStdenv.isDarwin darwinBuildInputs
-    ++ optionals useCuda cudaBuildInputs
-    ++ optionals useMpi [ mpi ]
-    ++ optionals useRocm rocmBuildInputs
-    ++ optionals useBlas [ blas ]
-    ++ optionals useVulkan vulkanBuildInputs
-    ++ optionals enableCurl [ curl ];
-
-  cmakeFlags =
-    [
-      (cmakeBool "LLAMA_BUILD_SERVER" true)
-      (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
-      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-      (cmakeBool "LLAMA_CURL" enableCurl)
-      (cmakeBool "GGML_NATIVE" false)
-      (cmakeBool "GGML_BLAS" useBlas)
-      (cmakeBool "GGML_CUDA" useCuda)
-      (cmakeBool "GGML_HIP" useRocm)
-      (cmakeBool "GGML_METAL" useMetalKit)
-      (cmakeBool "GGML_VULKAN" useVulkan)
-      (cmakeBool "GGML_STATIC" enableStatic)
-    ]
-    ++ optionals useCuda [
-      (
-        with cudaPackages.flags;
-        cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
-          builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
+        (
+          with cudaPackages.flags;
+          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
+            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
+          )
         )
-      )
-    ]
-    ++ optionals useRocm [
-      (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
-    ]
-    ++ optionals useMetalKit [
-      (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
-      (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
-    ];
+      ]
+      ++ optionals useRocm [
+        (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
+        (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")

-  # Environment variables needed for ROCm
-  env = optionals useRocm {
-    ROCM_PATH = "${rocmPackages.clr}";
-    HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
-  };
+        # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
+        # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
+        # and select the line that matches the current nixpkgs version of rocBLAS.
+        # Should likely use `rocmPackages.clr.gpuTargets`.
+        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
+      ]
+      ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
+      ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];

   # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
||||||
# if they haven't been added yet.
|
# if they haven't been added yet.
|
||||||
postInstall = ''
|
postInstall = ''
|
||||||
mkdir -p $out/include
|
mv $out/bin/main $out/bin/llama
|
||||||
cp $src/include/llama.h $out/include/
|
mv $out/bin/server $out/bin/llama-server
|
||||||
'';
|
mkdir -p $out/include
|
||||||
|
cp $src/llama.h $out/include/
|
||||||
|
'';
|
||||||
|
|
||||||
meta = {
|
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
|
||||||
# Configurations we don't want even the CI to evaluate. Results in the
|
passthru = {
|
||||||
# "unsupported platform" messages. This is mostly a no-op, because
|
inherit
|
||||||
# cudaPackages would've refused to evaluate anyway.
|
useBlas
|
||||||
badPlatforms = optionals useCuda lib.platforms.darwin;
|
useCuda
|
||||||
|
useMetalKit
|
||||||
|
useMpi
|
||||||
|
useOpenCL
|
||||||
|
useRocm
|
||||||
|
useVulkan
|
||||||
|
;
|
||||||
|
|
||||||
# Configurations that are known to result in build failures. Can be
|
shell = mkShell {
|
||||||
# overridden by importing Nixpkgs with `allowBroken = true`.
|
name = "shell-${finalAttrs.finalPackage.name}";
|
||||||
broken = (useMetalKit && !effectiveStdenv.isDarwin);
|
description = "contains numpy and sentencepiece";
|
||||||
|
buildInputs = [ llama-python ];
|
||||||
|
inputsFrom = [ finalAttrs.finalPackage ];
|
||||||
|
shellHook = ''
|
||||||
|
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
|
||||||
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
shell-extra = mkShell {
|
||||||
homepage = "https://github.com/ggerganov/llama.cpp/";
|
name = "shell-extra-${finalAttrs.finalPackage.name}";
|
||||||
license = lib.licenses.mit;
|
description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
|
||||||
|
buildInputs = [ llama-python-extra ];
|
||||||
|
inputsFrom = [ finalAttrs.finalPackage ];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
# Accommodates `nix run` and `lib.getExe`
|
meta = {
|
||||||
mainProgram = "llama-cli";
|
# Configurations we don't want even the CI to evaluate. Results in the
|
||||||
|
# "unsupported platform" messages. This is mostly a no-op, because
|
||||||
|
# cudaPackages would've refused to evaluate anyway.
|
||||||
|
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
|
||||||
|
|
||||||
# These people might respond, on the best effort basis, if you ping them
|
# Configurations that are known to result in build failures. Can be
|
||||||
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
|
# overridden by importing Nixpkgs with `allowBroken = true`.
|
||||||
# Consider adding yourself to this list if you want to ensure this flake
|
broken = (useMetalKit && !effectiveStdenv.isDarwin);
|
||||||
# stays maintained and you're willing to invest your time. Do not add
|
|
||||||
# other people without their consent. Consider removing people after
|
|
||||||
# they've been unreachable for long periods of time.
|
|
||||||
|
|
||||||
# Note that lib.maintainers is defined in Nixpkgs, but you may just add
|
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
||||||
# an attrset following the same format as in
|
homepage = "https://github.com/ggerganov/llama.cpp/";
|
||||||
# https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
|
license = lib.licenses.mit;
|
||||||
maintainers = with lib.maintainers; [
|
|
||||||
philiptaron
|
|
||||||
SomeoneSerge
|
|
||||||
];
|
|
||||||
|
|
||||||
# Extend `badPlatforms` instead
|
# Accommodates `nix run` and `lib.getExe`
|
||||||
platforms = lib.platforms.all;
|
mainProgram = "llama";
|
||||||
};
|
|
||||||
})
|
# These people might respond, on the best effort basis, if you ping them
|
||||||
|
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
|
||||||
|
# Consider adding yourself to this list if you want to ensure this flake
|
||||||
|
# stays maintained and you're willing to invest your time. Do not add
|
||||||
|
# other people without their consent. Consider removing people after
|
||||||
|
# they've been unreachable for long periods of time.
|
||||||
|
|
||||||
|
# Note that lib.maintainers is defined in Nixpkgs, but you may just add
|
||||||
|
# an attrset following the same format as in
|
||||||
|
# https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
|
||||||
|
maintainers = with lib.maintainers; [
|
||||||
|
philiptaron
|
||||||
|
SomeoneSerge
|
||||||
|
];
|
||||||
|
|
||||||
|
# Extend `badPlatforms` instead
|
||||||
|
platforms = lib.platforms.all;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
|
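For orientation, the cmakeFlags sets above map onto an ordinary CMake invocation. The sketch below is a hand-written approximation of the old branch's flags for a CUDA build, not part of this diff; the chosen ON/OFF values are illustrative assumptions.

# Rough manual equivalent of the Nix-driven configure on the old branch (CUDA enabled):
cmake -B build \
    -DLLAMA_NATIVE=OFF \
    -DLLAMA_BUILD_SERVER=ON \
    -DBUILD_SHARED_LIBS=ON \
    -DLLAMA_CUBLAS=ON
cmake --build build --config Release -j$(nproc)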
@@ -1,66 +0,0 @@
{
  lib,
  stdenv,
  buildPythonPackage,
  poetry-core,
  mkShell,
  python3Packages,
  gguf-py,
}@inputs:

let
  llama-python-deps = with python3Packages; [
    numpy
    sentencepiece
    transformers
    protobuf
    torchWithoutCuda
    gguf-py
    tqdm

    # for scripts/compare-llama-bench.py
    gitpython
    tabulate

    # for examples/pydantic-models-to-grammar-examples.py
    docstring-parser
    pydantic

  ];

  llama-python-test-deps = with python3Packages; [
    # Server bench
    matplotlib

    # server tests
    openai
    pytest
    prometheus-client
  ];
in

buildPythonPackage ({
  pname = "llama-scripts";
  version = "0.0.0";
  pyproject = true;

  # NOTE: The files filtered out here are not visible in the build sandbox, neither
  # do they affect the output hash. They can be modified without triggering a rebuild.
  src = lib.cleanSourceWith {
    filter =
      name: type:
      let
        any = builtins.any (x: x);
        baseName = builtins.baseNameOf name;
      in
      any [
        (lib.hasSuffix ".py" name)
        (baseName == "README.md")
        (baseName == "pyproject.toml")
      ];
    src = lib.cleanSource ../../.;
  };
  nativeBuildInputs = [ poetry-core ];
  nativeCheckInputs = llama-python-test-deps;
  dependencies = llama-python-deps;
})
@@ -1,41 +1,19 @@
 {
   lib,
   newScope,
-  python3,
   llamaVersion ? "0.0.0",
 }:
 
-let
-  pythonPackages = python3.pkgs;
-  buildPythonPackage = pythonPackages.buildPythonPackage;
-  numpy = pythonPackages.numpy;
-  tqdm = pythonPackages.tqdm;
-  sentencepiece = pythonPackages.sentencepiece;
-  pyyaml = pythonPackages.pyyaml;
-  poetry-core = pythonPackages.poetry-core;
-  pytestCheckHook = pythonPackages.pytestCheckHook;
-in
-
 # We're using `makeScope` instead of just writing out an attrset
 # because it allows users to apply overlays later using `overrideScope'`.
 # Cf. https://noogle.dev/f/lib/makeScope
 
-lib.makeScope newScope (self: {
-  inherit llamaVersion;
-  gguf-py = self.callPackage ./package-gguf-py.nix {
-    inherit
-      buildPythonPackage
-      numpy
-      tqdm
-      sentencepiece
-      poetry-core
-      pyyaml
-      pytestCheckHook
-      ;
-  };
-  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
-  llama-cpp = self.callPackage ./package.nix { };
-  docker = self.callPackage ./docker.nix { };
-  docker-min = self.callPackage ./docker.nix { interactive = false; };
-  sif = self.callPackage ./sif.nix { };
-})
+lib.makeScope newScope (
+  self: {
+    inherit llamaVersion;
+    llama-cpp = self.callPackage ./package.nix { };
+    docker = self.callPackage ./docker.nix { };
+    docker-min = self.callPackage ./docker.nix { interactive = false; };
+    sif = self.callPackage ./sif.nix { };
+  }
+)
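The makeScope block above exposes llama-cpp, docker, docker-min, and sif as packages in a single scope. A rough usage sketch, assuming the flake wires these attributes into its outputs under the same names (the attribute paths below are assumptions, not taken from this diff):

# Build individual outputs defined by the scope:
nix build .#llama-cpp
nix build .#docker-min
nix build .#sif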
@@ -1,113 +0,0 @@
ARG UBUNTU_VERSION=24.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.3
ARG AMDGPU_VERSION=6.3

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

### Build image
FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
# gfx906 is deprecated
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html

#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
ARG ROCM_DOCKER_ARCH=gfx1100

# Set nvcc architectured
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
# ENV CC=/opt/rocm/llvm/bin/clang
# ENV CXX=/opt/rocm/llvm/bin/clang++

RUN apt-get update \
    && apt-get install -y \
    build-essential \
    cmake \
    git \
    libcurl4-openssl-dev \
    curl \
    libgomp1

WORKDIR /app

COPY . .

RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
    && cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib \
    && find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_ROCM_DEV_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3-pip \
    python3 \
    python3-wheel\
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
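Because ROCM_DOCKER_ARCH is an ARG, the GPU architecture list of the removed image above can be overridden at build time instead of editing the Dockerfile. A hypothetical invocation; the image tag, the gfx target, and the Dockerfile path placeholder are not taken from this diff:

docker build -t llama-rocm-full --build-arg ROCM_DOCKER_ARCH=gfx90a -f <this Dockerfile> .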
32 .devops/server-cuda.Dockerfile Normal file
@@ -0,0 +1,32 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

COPY --from=build /app/server /server

ENTRYPOINT [ "/server" ]
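A hypothetical build-and-run sequence for the CUDA server image above; the tag, model path, and server flags are placeholders, and GPU passthrough assumes the NVIDIA container toolkit is installed on the host:

docker build -t llama-server-cuda -f .devops/server-cuda.Dockerfile .
docker run --gpus all -p 8080:8080 -v /path/to/models:/models \
    llama-server-cuda -m /models/ggml-model-q4_0.gguf --host 0.0.0.0 --port 8080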
28 .devops/server-intel.Dockerfile Normal file
@@ -0,0 +1,28 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git

WORKDIR /app

COPY . .

RUN mkdir build && \
    cd build && \
    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
        echo "LLAMA_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
    fi && \
    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
    cmake --build . --config Release --target server

FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

COPY --from=build /app/build/bin/server /server

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/server" ]
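The LLAMA_SYCL_F16 build argument toggles FP16 support when the image is built; a sketch of turning it on (the tag name is a placeholder, not part of this diff):

docker build -t llama-server-intel --build-arg LLAMA_SYCL_F16=ON -f .devops/server-intel.Dockerfile .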
45 .devops/server-rocm.Dockerfile Normal file
@@ -0,0 +1,45 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

RUN make

ENTRYPOINT [ "/app/server" ]
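Running the resulting ROCm image needs the host's GPU devices passed through to the container; a hypothetical invocation (tag, model path, and server flags are placeholders, not part of this diff):

docker build -t llama-server-rocm -f .devops/server-rocm.Dockerfile .
docker run --device /dev/kfd --device /dev/dri -p 8080:8080 -v /path/to/models:/models \
    llama-server-rocm -m /models/ggml-model-q4_0.gguf --host 0.0.0.0 --port 8080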
29 .devops/server-vulkan.Dockerfile Normal file
@@ -0,0 +1,29 @@
ARG UBUNTU_VERSION=jammy

FROM ubuntu:$UBUNTU_VERSION as build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk

# Build it
WORKDIR /app
COPY . .
RUN mkdir build && \
    cd build && \
    cmake .. -DLLAMA_VULKAN=1 && \
    cmake --build . --config Release --target server

# Clean up
WORKDIR /
RUN cp /app/build/bin/server /server && \
    rm -rf /app

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/server" ]
20 .devops/server.Dockerfile Normal file
@@ -0,0 +1,20 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

RUN make

FROM ubuntu:$UBUNTU_VERSION as runtime

COPY --from=build /app/server /server

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/server" ]
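A minimal CPU-only sketch of using the plain server image above (tag and model path are placeholders); the /health probe mirrors the HEALTHCHECK endpoint used by the other images in this diff:

docker build -t llama-server -f .devops/server.Dockerfile .
docker run -p 8080:8080 -v /path/to/models:/models \
    llama-server -m /models/ggml-model-q4_0.gguf --host 0.0.0.0 --port 8080
curl -f http://localhost:8080/health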
@@ -8,40 +8,36 @@ arg1="$1"
 shift
 
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    exec python3 ./convert_hf_to_gguf.py "$@"
+    python3 ./convert.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    exec ./llama-quantize "$@"
+    ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    exec ./llama-cli "$@"
-elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
-    exec ./llama-bench "$@"
-elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
-    exec ./llama-perplexity "$@"
+    ./main "$@"
+elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
+    ./finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
-    for i in $(ls $1/$2/ggml-model-f16.bin*); do
+    for i in `ls $1/$2/ggml-model-f16.bin*`; do
         if [ -f "${i/f16/q4_0}" ]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            ./quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    exec ./llama-server "$@"
+    ./server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
     echo "  --run (-r): Run a model previously converted into ggml"
     echo "      ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
-    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
-    echo "      ex: -m model.gguf"
-    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
-    echo "      ex: -m model.gguf -f file.txt"
     echo "  --convert (-c): Convert a llama model into ggml"
     echo "      ex: --outtype f16 \"/models/7B/\" "
     echo "  --quantize (-q): Optimize with quantization process ggml"
     echo "      ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
+    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
+    echo "      See documentation for finetune for command-line parameters"
     echo "  --all-in-one (-a): Execute --convert & --quantize"
     echo "      ex: \"/models/\" 7B"
     echo "  --server (-s): Run a model on the server"
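Taken together, the entrypoint script above dispatches on its first flag. A usage sketch against the old branch's commands, where the image name and model paths are placeholders and the flags and example arguments come from the script's own help text:

docker run -v /models:/models <image> --convert --outtype f16 "/models/7B/"
docker run -v /models:/models <image> --quantize "/models/7B/ggml-model-f16.bin" "/models/7B/ggml-model-q4_0.bin" 2
docker run -v /models:/models <image> --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
docker run -v /models:/models <image> --server -m /models/7B/ggml-model-q4_0.bin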
@@ -1,89 +0,0 @@
ARG UBUNTU_VERSION=24.04

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl

# Build it
WORKDIR /app

COPY . .

RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ubuntu:$UBUNTU_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl libvulkan-dev \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    python3-wheel \
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
@@ -1,7 +1,7 @@
 *.o
 *.a
 .cache/
-# Do not ignore .git directory, otherwise the reported build number will always be 0
+.git/
 .github/
 .gitignore
 .vs/
@@ -12,8 +12,8 @@ build*/
 
 models/*
 
-/llama-cli
-/llama-quantize
+/main
+/quantize
 
 arm_neon.h
 compile_commands.json
2 .ecrc
@@ -1,5 +1,5 @@
 {
-    "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
+    "Exclude": ["^\\.gitmodules$"],
     "Disable": {
         "IndentSize": true
     }
@@ -24,27 +24,5 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2
 
-[examples/server/public/deps_*]
-trim_trailing_whitespace = unset
-indent_style = unset
-indent_size = unset
-
-[examples/server/deps_*]
-trim_trailing_whitespace = unset
-indent_style = unset
-indent_size = unset
-
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
-
-[examples/cvector-generator/*.txt]
-trim_trailing_whitespace = unset
-insert_final_newline = unset
-
-[models/templates/*.jinja]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
-trim_trailing_whitespace = unset
-insert_final_newline = unset
16 .flake8
@@ -1,17 +1,3 @@
 [flake8]
 max-line-length = 125
-ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
-exclude =
-    # Do not traverse examples
-    examples,
-    # Do not include package initializers
-    __init__.py,
-    # No need to traverse our git directory
-    .git,
-    # There's no value in checking cache directories
-    __pycache__,
-    # No need to include the build path
-    build,
-    # This contains builds that we don't want to check
-    dist # This is generated with `python build .` for package releases
-# max-complexity = 10
+ignore = W503
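With either version of this configuration checked in, a local lint pass is a plain flake8 run from the repository root; the invocation below is an assumption, not part of this diff:

pip install flake8
flake8    # reads .flake8 automatically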
87 .github/ISSUE_TEMPLATE/010-bug-compilation.yml vendored
@@ -1,87 +0,0 @@
name: Bug (compilation)
description: Something goes wrong when trying to compile llama.cpp.
title: "Compile bug: "
labels: ["bug-unconfirmed", "compilation"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for bug reports where the compilation of llama.cpp fails.
        Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
        by clearing `~/.cache/ccache` (on Linux).
  - type: textarea
    id: commit
    attributes:
      label: Git commit
      description: Which commit are you trying to compile?
      placeholder: |
        $git rev-parse HEAD
        84a07a17b1b08cf2b9747c633a2372782848a27f
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: true
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
      multiple: true
    validations:
      required: true
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
      placeholder: >
        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
        Here are the exact commands that I used: ...
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: command
    attributes:
      label: Compile command
      description: >
        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        Please copy and paste any relevant log output, including any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
101 .github/ISSUE_TEMPLATE/011-bug-results.yml vendored
@@ -1,101 +0,0 @@
name: Bug (model use)
description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
title: "Eval bug: "
labels: ["bug-unconfirmed", "model evaluation"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for bug reports where the model evaluation results
        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
        The `llama-cli` binary can be used for simple and reproducible model inference.
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: true
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
      multiple: true
    validations:
      required: true
  - type: textarea
    id: hardware
    attributes:
      label: Hardware
      description: Which CPUs/GPUs are you using?
      placeholder: >
        e.g. Ryzen 5950X + 2x RTX 4090
    validations:
      required: true
  - type: textarea
    id: model
    attributes:
      label: Models
      description: >
        Which model(s) at which quantization were you using when encountering the bug?
        If you downloaded a GGUF file off of Huggingface, please provide a link.
      placeholder: >
        e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
    validations:
      required: false
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
        that information would be very much appreciated by us.
      placeholder: >
        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
        When I use -ngl 0 it works correctly.
        Here are the exact commands that I used: ...
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        Please copy and paste any relevant log output, including the command that you entered and any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
91 .github/ISSUE_TEMPLATE/019-bug-misc.yml vendored
@@ -1,91 +0,0 @@
name: Bug (misc.)
description: Something is not working the way it should (and it's not covered by any of the above cases).
title: "Misc. bug: "
labels: ["bug-unconfirmed"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for miscellaneous bugs that don't fit into any other category.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which version of our software is affected? (You can use `--version` to get a version string.)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: dropdown
    id: module
    attributes:
      label: Which llama.cpp modules do you know to be affected?
      multiple: true
      options:
        - Documentation/Github
        - libllama (core library)
        - llama-cli
        - llama-server
        - llama-bench
        - llama-quantize
        - Python/Bash scripts
        - Test code
        - Other (Please specify in the next section)
    validations:
      required: false
  - type: textarea
    id: command
    attributes:
      label: Command line
      description: >
        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: false
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        If applicable, please copy and paste any relevant log output, including any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: false
51 .github/ISSUE_TEMPLATE/020-enhancement.yml vendored
@@ -1,51 +0,0 @@
name: Enhancement
description: Used to request enhancements for llama.cpp.
title: "Feature Request: "
labels: ["enhancement"]
body:
  - type: markdown
    attributes:
      value: |
        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)

  - type: checkboxes
    id: prerequisites
    attributes:
      label: Prerequisites
      description: Please confirm the following before submitting your enhancement request.
      options:
        - label: I am running the latest code. Mention the version if possible as well.
          required: true
        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
          required: true
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
          required: true

  - type: textarea
    id: feature-description
    attributes:
      label: Feature Description
      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
      placeholder: Detailed description of the enhancement
    validations:
      required: true

  - type: textarea
    id: motivation
    attributes:
      label: Motivation
      description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
      placeholder: Explanation of why this feature is needed and its benefits
    validations:
      required: true

  - type: textarea
    id: possible-implementation
    attributes:
      label: Possible Implementation
      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
      placeholder: Detailed description of potential implementation
    validations:
      required: false
52 .github/ISSUE_TEMPLATE/030-research.yml vendored
@@ -1,52 +0,0 @@
name: Research
description: Track new technical research area.
title: "Research: "
labels: ["research 🔬"]
body:
  - type: markdown
    attributes:
      value: |
        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

  - type: checkboxes
    id: research-stage
    attributes:
      label: Research Stage
      description: Track general state of this research ticket
      options:
        - label: Background Research (Let's try to avoid reinventing the wheel)
        - label: Hypothesis Formed (How do you think this will work and it's effect?)
        - label: Strategy / Implementation Forming
        - label: Analysis of results
        - label: Debrief / Documentation (So people in the future can learn from us)

  - type: textarea
    id: background
    attributes:
      label: Previous existing literature and research
      description: Whats the current state of the art and whats the motivation for this research?

  - type: textarea
    id: hypothesis
    attributes:
      label: Hypothesis
      description: How do you think this will work and it's effect?

  - type: textarea
    id: implementation
    attributes:
      label: Implementation
      description: Got an approach? e.g. a PR ready to go?

  - type: textarea
    id: analysis
    attributes:
      label: Analysis
      description: How does the proposed implementation behave?

  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
28 .github/ISSUE_TEMPLATE/040-refactor.yml vendored
@@ -1,28 +0,0 @@
name: Refactor (Maintainers)
description: Used to track refactoring opportunities.
title: "Refactor: "
labels: ["refactor"]
body:
  - type: markdown
    attributes:
      value: |
        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

  - type: textarea
    id: background-description
    attributes:
      label: Background Description
      description: Please provide a detailed written description of the pain points you are trying to solve.
      placeholder: Detailed description behind your motivation to request refactor
    validations:
      required: true

  - type: textarea
    id: possible-approaches
    attributes:
      label: Possible Refactor Approaches
      description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
      placeholder: Your idea of possible refactoring opportunity/approaches
    validations:
      required: false
11 .github/ISSUE_TEMPLATE/bug.md vendored Normal file
@@ -0,0 +1,11 @@
---
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug-unconfirmed"]
assignees: ''

---

Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.

If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
11 .github/ISSUE_TEMPLATE/config.yml vendored
@@ -1,11 +0,0 @@
blank_issues_enabled: true
contact_links:
  - name: Got an idea?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
    about: Pop it there. It may then become an enhancement ticket.
  - name: Got a question?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
    about: Ask a question there!
  - name: Want to contribute?
    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
28 .github/ISSUE_TEMPLATE/enhancement.md vendored Normal file
@@ -0,0 +1,28 @@
---
name: Enhancement template
about: Used to request enhancements for llama.cpp
labels: ["enhancement"]
assignees: ''

---

# Prerequisites

Please answer the following questions for yourself before submitting an issue.

- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.

# Feature Description

Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.

# Motivation

Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.

# Possible Implementation

If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
86 .github/labeler.yml vendored
@@ -1,86 +0,0 @@
# https://github.com/actions/labeler
Kompute:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-kompute.h
          - ggml/src/ggml-kompute/**
          - README-kompute.md
Apple Metal:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-metal.h
          - ggml/src/ggml-metal/**
          - README-metal.md
SYCL:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-sycl.h
          - ggml/src/ggml-sycl/**
          - docs/backend/SYCL.md
          - examples/sycl/**
Nvidia GPU:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-cuda.h
          - ggml/src/ggml-cuda/**
Vulkan:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-vulkan.h
          - ggml/src/ggml-vulkan/**
documentation:
  - changed-files:
      - any-glob-to-any-file:
          - docs/**
          - media/**
testing:
  - changed-files:
      - any-glob-to-any-file:
          - tests/**
build:
  - changed-files:
      - any-glob-to-any-file:
          - cmake/**
          - CMakeLists.txt
          - CMakePresets.json
examples:
  - changed-files:
      - any-glob-to-any-file: examples/**
devops:
  - changed-files:
      - any-glob-to-any-file:
          - .devops/**
          - .github/**
          - ci/**
python:
  - changed-files:
      - any-glob-to-any-file:
          - "**/*.py"
          - requirements/**
          - gguf-py/**
          - .flake8
script:
  - changed-files:
      - any-glob-to-any-file:
          - scripts/**
android:
  - changed-files:
      - any-glob-to-any-file:
          - examples/llama.android/**
server:
  - changed-files:
      - any-glob-to-any-file:
          - examples/server/**
ggml:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/**
nix:
  - changed-files:
      - any-glob-to-any-file:
          - "**/*.nix"
          - .github/workflows/nix-*.yml
          - .devops/nix/nixpkgs-instances.nix
embedding:
  - changed-files:
      - any-glob-to-any-file: examples/embedding/
1 .github/pull_request_template.md vendored
@@ -1 +0,0 @@
*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
.github/workflows/bench.yml.disabled (vendored): deleted file, 315 lines
@@ -1,315 +0,0 @@
# TODO: there have been some issues with the workflow, so disabling for now
# https://github.com/ggerganov/llama.cpp/issues/7893
#
# Benchmark
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    - cron: '04 2 * * *'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3

    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
      N_USERS: 8
      DURATION: 10m

    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"

    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
      || (
        github.event_name == 'schedule'
        && github.ref_name == 'master'
        && github.repository_owner == 'ggerganov'
      )
      || github.event_name == 'pull_request_target'
      || (
        github.event_name == 'push'
        && github.event.ref == 'refs/heads/master'
        && github.repository_owner == 'ggerganov'
      )

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'

      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
            --with github.com/phymbert/xk6-sse

      - name: Build
        id: cmake_build
        run: |
          set -eux
          cmake -B build \
            -DGGML_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DLLAMA_CUBLAS=ON \
            -DCUDAToolkit_ROOT=/usr/local/cuda \
            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
            -DCMAKE_CUDA_ARCHITECTURES=75 \
            -DLLAMA_FATAL_WARNINGS=OFF \
            -DLLAMA_ALL_WARNINGS=OFF \
            -DCMAKE_BUILD_TYPE=Release;
          cmake --build build --config Release -j $(nproc) --target llama-server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
        id: server_bench
        env:
          HEAD_REF: ${{ github.head_ref || github.ref_name }}
        run: |
          set -eux

          cd examples/server/bench
          source venv/bin/activate
          python bench.py \
            --runner-label ${{ env.RUNNER_LABEL }} \
            --name ${{ github.job }} \
            --branch $HEAD_REF \
            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
            --scenario script.js \
            --duration ${{ github.event.inputs.duration || env.DURATION }} \
            --hf-repo ggml-org/models \
            --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
            --model-path-prefix /models \
            --parallel ${{ env.N_USERS }} \
            -ngl 33 \
            --batch-size 2048 \
            --ubatch-size 256 \
            --ctx-size 16384 \
            --n-prompts 1000 \
            --max-prompt-tokens 1024 \
            --max-tokens 2048

          cat results.github.env >> $GITHUB_ENV

          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true # Important as it looks unstable: 503
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux

          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">

            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

            </p>

            <details>

            <summary>Expand details for performance related PR only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}


            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>
            </details>
            </details>
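The "Extract mermaid" step in the file above relies on GitHub Actions' delimiter syntax for writing multiline values to `$GITHUB_ENV`. A minimal standalone sketch of that pattern follows; the variable name `CHART` and the file `chart.mermaid` are illustrative only, and outside of Actions you can point `GITHUB_ENV` at any scratch file to try it.

```bash
#!/usr/bin/env bash
# Sketch of the multiline $GITHUB_ENV pattern used by the "Extract mermaid" step.
set -eu

CHART="$(cat chart.mermaid)"   # any multiline string

{
  echo "CHART<<EOF"            # NAME<<DELIMITER starts a multiline value
  echo "$CHART"                # the value itself, may span many lines
  echo "EOF"                   # the delimiter chosen above closes it
} >> "$GITHUB_ENV"
# Later workflow steps can then interpolate ${{ env.CHART }}.
```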
.github/workflows/build.yml (vendored): 1254 changed lines. File diff suppressed because it is too large.
.github/workflows/close-issue.yml (vendored): 9 changed lines
@@ -3,11 +3,6 @@ on:
   schedule:
     - cron: "42 0 * * *"
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  issues: write
 
 jobs:
   close-issues:
     runs-on: ubuntu-latest
@@ -17,12 +12,12 @@ jobs:
     steps:
       - uses: actions/stale@v5
         with:
-          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
+          exempt-issue-labels: "refactor,help wanted,good first issue,research"
           days-before-issue-stale: 30
           days-before-issue-close: 14
           stale-issue-label: "stale"
           close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
           days-before-pr-stale: -1
           days-before-pr-close: -1
-          operations-per-run: 10000
+          operations-per-run: 1000
           repo-token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/code-coverage.yml (vendored): new file, 36 lines
@@ -0,0 +1,36 @@
name: Code Coverage
on: [push, pull_request]

env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1

jobs:
  run:
    runs-on: ubuntu-20.04
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential gcc-8 lcov

      - name: Build
        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests

      - name: Run tests
        run: CC=gcc-8 make test

      - name: Generate coverage report
        run: |
          make coverage
          make lcov-report

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        with:
          files: lcov-report/coverage.info
.github/workflows/docker.yml (vendored): 156 changed lines
@@ -10,50 +10,45 @@
 name: Publish Docker image
 
 on:
-  workflow_dispatch: # allows manual triggering
+  pull_request:
-  schedule:
+  push:
-    # Rebuild daily rather than on every push because it is expensive
+    branches:
-    - cron: '12 4 * * *'
+      - master
 
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  packages: write
 
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
+    if: github.event.pull_request.draft == false
 
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     env:
       COMMIT_SHA: ${{ github.sha }}
     strategy:
-      fail-fast: false
       matrix:
         config:
-          # Multi-stage build
+          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
-          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          # have disabled them for now until the reason why
-          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          # is understood.
-          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
+          - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
-          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
+          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
     steps:
       - name: Check out the repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
-        with:
-          fetch-depth: 0 # preserve git history, so we can determine the build number
 
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
+        uses: docker/setup-qemu-action@v2
 
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2
 
       - name: Log in to Docker Hub
         uses: docker/login-action@v2
@@ -62,45 +57,9 @@ jobs:
           username: ${{ github.repository_owner }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Determine tag name
+      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
-          REPO_NAME="${{ github.event.repository.name }}"
-
-          # determine tag name postfix (build number, commit hash)
-          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
-            TAG_POSTFIX="-b${BUILD_NUMBER}"
-          else
-            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
-            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
-          fi
-          # list all tags possible
-          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
-            TYPE=""
-          else
-            TYPE="-${{ matrix.config.tag }}"
-          fi
-          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
-          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
-          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
-          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
-          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
-          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
-          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
-          echo "full_output_tags=$FULLTAGS" # print out for debugging
-          echo "light_output_tags=$LIGHTTAGS" # print out for debugging
-          echo "server_output_tags=$SERVERTAGS" # print out for debugging
-        env:
-          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
       - name: Free Disk Space (Ubuntu)
-        if: ${{ matrix.config.free_disk_space == true }}
+        uses: jlumbroso/free-disk-space@main
-        uses: ggml-org/free-disk-space@v1.3.1
         with:
           # this might remove tools that are actually needed,
           # if set to "true" but frees about 6 GB
@@ -115,59 +74,34 @@ jobs:
           docker-images: true
           swap-storage: true
 
-      - name: Build and push Full Docker image (tagged + versioned)
+      - name: Determine tag name
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
+        id: tag
-        uses: docker/build-push-action@v6
+        shell: bash
-        with:
+        run: |
-          context: .
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          push: true
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          platforms: ${{ matrix.config.platforms }}
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-          # tag list is generated from step above
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          tags: ${{ steps.tag.outputs.full_output_tags }}
+          else
-          file: ${{ matrix.config.dockerfile }}
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-          target: full
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          provenance: false
+          fi
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
 
-      - name: Build and push Light Docker image (tagged + versioned)
+      - name: Build and push Docker image (versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
+        if: github.event_name == 'push'
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v4
         with:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
+          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
-          tags: ${{ steps.tag.outputs.light_output_tags }}
           file: ${{ matrix.config.dockerfile }}
-          target: light
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
 
-      - name: Build and push Server Docker image (tagged + versioned)
+      - name: Build and push Docker image (tagged)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
+        uses: docker/build-push-action@v4
-        uses: docker/build-push-action@v6
         with:
           context: .
-          push: true
+          push: ${{ github.event_name == 'push' }}
           platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
+          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
-          tags: ${{ steps.tag.outputs.server_output_tags }}
           file: ${{ matrix.config.dockerfile }}
-          target: server
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
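The removed "Determine tag name" step above encodes the image-tag scheme: builds from master get a "-b<build-number>" postfix, other branches get "-<branch>-<short-hash>", and the "cpu" flavour drops its type suffix. A standalone sketch of the same logic follows; the owner/repo in PREFIX and the example flavour are placeholders, since the real step derives them from the repository context.

```bash
#!/usr/bin/env bash
# Sketch of the tag-naming logic from the removed "Determine tag name" step.
# Run inside a git checkout; BRANCH would normally come from the workflow context.
set -eu

BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
BRANCH="${BRANCH:-master}"                     # assumption: branch name supplied via env
PREFIX="ghcr.io/example-owner/llama.cpp:"      # placeholder owner/repo

if [[ "$BRANCH" == "master" ]]; then
    TAG_POSTFIX="-b${BUILD_NUMBER}"
else
    TAG_POSTFIX="-$(echo "$BRANCH" | tr '/' '-')-${SHORT_HASH}"
fi

# e.g. ghcr.io/example-owner/llama.cpp:full-cuda,ghcr.io/example-owner/llama.cpp:full-cuda-b4242
echo "${PREFIX}full-cuda,${PREFIX}full-cuda${TAG_POSTFIX}"
```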
.github/workflows/editorconfig.yml (vendored): 10 changed lines
@@ -14,16 +14,10 @@ on:
     branches:
       - master
 
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
 jobs:
   editorconfig:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
-      - uses: editorconfig-checker/action-editorconfig-checker@v2
+      - uses: editorconfig-checker/action-editorconfig-checker@main
-        with:
-          version: v3.0.3
       - run: editorconfig-checker
.github/workflows/gguf-publish.yml (vendored): 4 changed lines
@@ -24,9 +24,9 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v2
         with:
           python-version: '3.9.x'
       - name: Install dependencies
.github/workflows/labeler.yml (vendored): deleted file, 17 lines
@@ -1,17 +0,0 @@
name: "Pull Request Labeler"
on:
  - pull_request_target

jobs:
  labeler:
    permissions:
      contents: read
      pull-requests: write
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          repository: "ggerganov/llama.cpp"
      - uses: actions/labeler@v5
        with:
          configuration-path: '.github/labeler.yml'
.github/workflows/nix-ci-aarch64.yml (vendored): new file, 61 lines
@@ -0,0 +1,61 @@
name: Nix aarch64 builds

on:
  workflow_dispatch: # allows manual triggering
  schedule:
    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
    # 1.5h instead of minutes with the cold cache).
    #
    # randint(0, 59), randint(0, 23)
    - cron: '26 12 * * *'
  # But also rebuild if we touched any of the Nix expressions:
  push:
    branches:
      - master
    paths: ['**/*.nix', 'flake.lock']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['**/*.nix', 'flake.lock']

jobs:
  nix-build-aarch64:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install QEMU
        # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
        run: |
          sudo apt-get update
          sudo apt-get install -y qemu-user-static qemu-system-aarch64
          sudo usermod -a -G kvm $USER
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-platforms = aarch64-linux
            extra-system-features = nixos-test kvm
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: llama-cpp
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.aarch64-linux"
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --systems aarch64-linux
          --flake
          ".#checks.aarch64-linux"
.github/workflows/nix-ci.yml (vendored): new file, 68 lines
@@ -0,0 +1,68 @@
name: Nix CI

on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
  pull_request:
    types: [opened, synchronize, reopened]

jobs:
  nix-eval:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: List all flake outputs
        run: nix flake show --all-systems
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
  nix-build:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: llama-cpp
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --flake
          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
.github/workflows/nix-flake-update.yml (vendored): new file, 22 lines
@@ -0,0 +1,22 @@
name: update-flake-lock
on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00

jobs:
  lockfile:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@main
      - name: Update flake.lock
        uses: DeterminateSystems/update-flake-lock@main
        with:
          pr-title: "nix: update flake.lock"
          pr-labels: |
            nix
          pr-reviewers: philiptaron,SomeoneSerge
          token: ${{ secrets.FLAKE_TOKEN }}
.github/workflows/nix-publish-flake.yml (vendored): new file, 36 lines
@@ -0,0 +1,36 @@
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
  push:
    tags:
      - "*"
  workflow_dispatch:
    inputs:
      tag:
        description: "The existing tag to publish"
        type: "string"
        required: true
jobs:
  flakestry-publish:
    runs-on: ubuntu-latest
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: flakestry/flakestry-publish@main
        with:
          version: "${{ inputs.tag || github.ref_name }}"
  flakehub-publish:
    runs-on: "ubuntu-latest"
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: "actions/checkout@v4"
        with:
          ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
      - uses: "DeterminateSystems/nix-installer-action@main"
      - uses: "DeterminateSystems/flakehub-push@main"
        with:
          visibility: "public"
          tag: "${{ inputs.tag }}"
.github/workflows/python-check-requirements.yml (vendored): 14 changed lines
@@ -6,17 +6,15 @@ on:
       - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
-      - '**/requirements*.txt'
+      - 'requirements.txt'
+      - 'requirements/*.txt'
   pull_request:
     paths:
       - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
-      - '**/requirements*.txt'
+      - 'requirements.txt'
+      - 'requirements/*.txt'
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
 
 jobs:
   python-check-requirements:
@@ -24,9 +22,9 @@ jobs:
     name: check-requirements
     steps:
       - name: Check out source repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
       - name: Set up Python environment
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
         with:
           python-version: "3.11"
       - name: Run check-requirements.sh script
.github/workflows/python-lint.yml (vendored): 20 changed lines
@@ -1,17 +1,6 @@
 name: flake8 Lint
 
-on:
+on: [push, pull_request]
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/python-lint.yml', '**/*.py']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/python-lint.yml', '**/*.py']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
 
 jobs:
   flake8-lint:
@@ -19,12 +8,13 @@ jobs:
     name: Lint
     steps:
      - name: Check out source repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
      - name: Set up Python environment
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: flake8 Lint
        uses: py-actions/flake8@v2
        with:
-          plugins: "flake8-no-print"
+          ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
+          exclude: "examples/*,examples/*/**,*/**/__init__.py"
.github/workflows/python-type-check.yml
vendored
40
.github/workflows/python-type-check.yml
vendored
|
@ -1,40 +0,0 @@
|
||||||
name: Python Type-Check
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
paths:
|
|
||||||
- '.github/workflows/python-type-check.yml'
|
|
||||||
- 'pyrightconfig.json'
|
|
||||||
- '**.py'
|
|
||||||
- '**/requirements*.txt'
|
|
||||||
pull_request:
|
|
||||||
paths:
|
|
||||||
- '.github/workflows/python-type-check.yml'
|
|
||||||
- 'pyrightconfig.json'
|
|
||||||
- '**.py'
|
|
||||||
- '**/requirements*.txt'
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
python-type-check:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
name: pyright type-check
|
|
||||||
steps:
|
|
||||||
- name: Check out source repository
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
- name: Set up Python environment
|
|
||||||
uses: actions/setup-python@v5
|
|
||||||
with:
|
|
||||||
python-version: "3.11"
|
|
||||||
- name: Install Python dependencies
|
|
||||||
# TODO: use a venv
|
|
||||||
run: pip install -r requirements/requirements-all.txt
|
|
||||||
- name: Type-check with Pyright
|
|
||||||
uses: jakebailey/pyright-action@v2
|
|
||||||
with:
|
|
||||||
version: 1.1.382
|
|
||||||
level: warning
|
|
||||||
warnings: true
|
|
.github/workflows/server.yml (vendored): 182 changed lines
@@ -4,10 +4,6 @@ name: Server
 on:
   workflow_dispatch: # allows manual triggering
     inputs:
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
       slow_tests:
         description: 'Run slow tests'
         required: true
@@ -15,20 +11,12 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
+  schedule:
-env:
+    - cron: '0 0 * * *'
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
 
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
 
 jobs:
   server:
@@ -36,160 +24,84 @@ jobs:
 
     strategy:
       matrix:
-        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
+        # TODO: temporary disabled due to linux kernel issues
-        build_type: [RelWithDebInfo]
+        #sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        sanitizer: [UNDEFINED]
+        build_type: [Debug]
         include:
           - build_type: Release
             sanitizer: ""
+            disabled_on_pr: true
       fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
 
+    container:
+      image: ubuntu:latest
+      ports:
+        - 8888
+      options: --cpus 4
 
     steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
 
       - name: Dependencies
         id: depends
         run: |
-          sudo apt-get update
+          apt-get update
-          sudo apt-get -y install \
+          apt-get -y install \
            build-essential \
-            xxd \
            git \
            cmake \
-            curl \
+            python3-pip \
            wget \
            language-pack-en \
            libcurl4-openssl-dev
 
-      - name: Clone
+      - name: Build
-        id: checkout
+        id: cmake_build
-        uses: actions/checkout@v4
+        run: |
-        with:
+          mkdir build
-          fetch-depth: 0
+          cd build
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+          cmake .. \
+            -DLLAMA_NATIVE=OFF \
-      - name: Python setup
+            -DLLAMA_BUILD_SERVER=ON \
-        id: setup_python
+            -DLLAMA_CURL=ON \
-        uses: actions/setup-python@v5
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-        with:
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          python-version: '3.11'
+          cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
 
       - name: Tests dependencies
         id: test_dependencies
         run: |
           pip install -r examples/server/tests/requirements.txt
 
-      # Setup nodejs (to be used for verifying bundled index.html)
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '22.11.0'
-
-      - name: WebUI - Install dependencies
-        id: webui_lint
-        run: |
-          cd examples/server/webui
-          npm ci
-
-      - name: WebUI - Check code format
-        id: webui_format
-        run: |
-          git config --global --add safe.directory $(realpath .)
-          cd examples/server/webui
-          git status
-
-          npm run format
-          git status
-          modified_files="$(git status -s)"
-          echo "Modified files: ${modified_files}"
-          if [ -n "${modified_files}" ]; then
-            echo "Files do not follow coding style. To fix: npm run format"
-            echo "${modified_files}"
-            exit 1
-          fi
-
-      - name: Verify bundled index.html
-        id: verify_server_index_html
-        run: |
-          git config --global --add safe.directory $(realpath .)
-          cd examples/server/webui
-          git status
-
-          npm run build
-          git status
-          modified_files="$(git status -s)"
-          echo "Modified files: ${modified_files}"
-          if [ -n "${modified_files}" ]; then
-            echo "Repository is dirty or server/webui is not built as expected"
-            echo "Hint: You may need to follow Web UI build guide in server/README.md"
-            echo "${modified_files}"
-            exit 1
-          fi
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DLLAMA_CURL=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DGGML_OPENMP=OFF ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build_sanitizers
-        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DLLAMA_CURL=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build
-        if: ${{ matrix.sanitizer == '' }}
-        run: |
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DLLAMA_CURL=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
 
       - name: Tests
         id: server_integration_tests
-        if: ${{ matrix.sanitizer == '' }}
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
-          ./tests.sh
+          PORT=8888 ./tests.sh
 
-      - name: Tests (sanitizers)
-        id: server_integration_tests_sanitizers
-        if: ${{ matrix.sanitizer != '' }}
-        run: |
-          cd examples/server/tests
-          LLAMA_SANITIZE=1 ./tests.sh
 
       - name: Slow tests
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
-          SLOW_TESTS=1 ./tests.sh
+          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
 
 
   server-windows:
-    runs-on: windows-2019
+    runs-on: windows-latest
 
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
         with:
           fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
 
       - name: libCURL
         id: get_libcurl
@@ -203,8 +115,10 @@ jobs:
       - name: Build
         id: cmake_build
         run: |
-          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
+          mkdir build
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
+          cd build
+          cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
 
       - name: Python setup
         id: setup_python
@@ -227,13 +141,11 @@ jobs:
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
-          $env:PYTHONIOENCODING = ":replace"
+          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
-          pytest -v -x -m "not slow"
 
       - name: Slow tests
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
-          $env:SLOW_TESTS = "1"
+          behave.exe --stop --no-skipped --no-capture --tags slow
-          pytest -v -x
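The removed WebUI steps above gate on a clean working tree: they run the formatter (or the bundling build), then fail the job if `git status -s` reports any modified files. The same pattern, reduced to a standalone sketch; the `npm run format` target is taken from the step above, and any command whose output is expected to be already committed would work in its place.

```bash
#!/usr/bin/env bash
# Sketch of the "tree must stay clean" check used by the removed WebUI steps.
set -eu

npm run format                 # command whose results should already be committed

modified_files="$(git status -s)"
if [ -n "${modified_files}" ]; then
    echo "Files do not follow coding style. To fix: npm run format"
    echo "${modified_files}"
    exit 1                     # a dirty tree fails the check
fi
```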
.github/workflows/zig-build.yml (vendored): new file, 25 lines
@@ -0,0 +1,25 @@
name: Zig CI

on:
  pull_request:
  push:
    branches:
      - master

jobs:
  build:
    strategy:
      fail-fast: false
      matrix:
        runs-on: [ubuntu-latest, macos-latest, windows-latest]
    runs-on: ${{ matrix.runs-on }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: recursive
          fetch-depth: 0
      - uses: goto-bus-stop/setup-zig@v2
        with:
          version: 0.11.0
      - name: Build Summary
        run: zig build --summary all -freference-trace
.gitignore (vendored): 188 changed lines
@@ -1,145 +1,97 @@
-# Extensions
-
-*.a
-*.bat
-*.bin
-*.d
-*.dll
-*.dot
-*.etag
-*.exe
-*.gcda
-*.gcno
-*.gcov
-*.gguf
-*.gguf.json
-*.lastModified
-*.log
-*.metallib
 *.o
+*.a
 *.so
-*.swp
+*.gguf
-*.tmp
+*.bin
+*.exe
-
-# IDE / OS
+*.dll
+*.log
+*.gcov
+*.gcno
+*.gcda
+*.dot
+*.bat
+*.metallib
+.DS_Store
+.build/
 .cache/
 .ccls-cache/
 .direnv/
-.DS_Store
 .envrc
-.idea/
 .swiftpm
+.venv
+.clang-tidy
 .vs/
 .vscode/
-nppBackup
+.idea/
 
-# Coverage
-
-gcovr-report/
-lcov-report/
-
-# Build Artifacts
-
-tags
-.build/
-build*
-!build-info.cmake
-!build-info.cpp.in
-!build-info.sh
-!build.zig
-!docs/build.md
-/libllama.so
-/llama-*
-/vulkan-shaders-gen
-android-ndk-*
-arm_neon.h
-cmake-build-*
-CMakeSettings.json
-compile_commands.json
 ggml-metal-embed.metal
-llama-batched-swift
-/rpc-server
+lcov-report/
+gcovr-report/
 
+build*
+cmake-build-*
 out/
 tmp/
-autogen-*.md
-
-# Deprecated
-
-/main
-/server
-
-# CI
-
-!.github/workflows/*.yml
-
-# Models
-
 models/*
 models-mnt
-!models/.editorconfig
-!models/ggml-vocab-*.gguf*
 
-# Zig
+/Pipfile
+/baby-llama
+/beam-search
+/benchmark-matmult
+/convert-llama2c-to-ggml
+/embd-input-test
+/embedding
+/gguf
+/gguf-llama-simple
+/gritlm
+/imatrix
+/infill
+/libllama.so
+/llama-bench
+/llava-cli
+/lookahead
+/lookup
+/main
+/metal
+/passkey
+/perplexity
+/q8dot
+/quantize
+/quantize-stats
+/result
+/save-load-state
+/server
+/simple
+/batched
+/batched-bench
+/export-lora
+/finetune
+/speculative
+/parallel
+/train-text-from-scratch
+/tokenize
+/vdot
+/common/build-info.cpp
+arm_neon.h
+compile_commands.json
+CMakeSettings.json
+
+__pycache__
+dist
 
 zig-out/
 zig-cache/
 
-# Logs
-
 ppl-*.txt
 qnt-*.txt
 perf-*.txt
 
-# Examples
-
 examples/jeopardy/results.txt
-examples/server/*.css.hpp
-examples/server/*.html.hpp
-examples/server/*.js.hpp
-examples/server/*.mjs.hpp
-!build_64.sh
-!examples/*.bat
-!examples/*/*.kts
-!examples/*/*/*.kts
-!examples/sycl/*.bat
-!examples/sycl/*.sh
 
-# Server Web UI temporary files
+poetry.lock
-node_modules
-examples/server/webui/dist
-
-# Python
-
-/.venv
-__pycache__/
-*/poetry.lock
 poetry.toml
+nppBackup
-# Nix
-/result
-
-# Test binaries
-/tests/test-backend-ops
-/tests/test-double-float
-/tests/test-grad0
-/tests/test-grammar-parser
-/tests/test-llama-grammar
-/tests/test-opt
-/tests/test-quantize-fns
-/tests/test-quantize-perf
-/tests/test-rope
-/tests/test-sampling
-/tests/test-tokenizer-0
-/tests/test-tokenizer-1-bpe
-/tests/test-tokenizer-1-spm
-
-# Scripts
-!/scripts/install-oneapi.bat
-
-# Test models for lora adapters
-/lora-tests
-
-# Local scripts
-/run-vim.sh
-/run-chat.sh
.gitmodules (vendored): 2 changed lines
@@ -1,3 +1,3 @@
 [submodule "kompute"]
-	path = ggml/src/ggml-kompute/kompute
+	path = kompute
 	url = https://github.com/nomic-ai/kompute.git
.pre-commit-config.yaml: 3 changed lines
@@ -3,14 +3,13 @@
 exclude: prompts/.*.txt
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.6.0
+  rev: v3.2.0
   hooks:
     - id: trailing-whitespace
     - id: end-of-file-fixer
     - id: check-yaml
     - id: check-added-large-files
 - repo: https://github.com/PyCQA/flake8
-  rev: 7.0.0
+  rev: 6.0.0
   hooks:
     - id: flake8
-      additional_dependencies: [flake8-no-print]
|
1269
CMakeLists.txt
1269
CMakeLists.txt
File diff suppressed because it is too large
Load diff
|
@ -1,97 +0,0 @@
|
||||||
{
|
|
||||||
"version": 4,
|
|
||||||
"configurePresets": [
|
|
||||||
{
|
|
||||||
"name": "base",
|
|
||||||
"hidden": true,
|
|
||||||
"generator": "Ninja",
|
|
||||||
"binaryDir": "${sourceDir}/build-${presetName}",
|
|
||||||
"cacheVariables": {
|
|
||||||
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
|
|
||||||
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "sycl-base",
|
|
||||||
"hidden": true,
|
|
||||||
"generator": "Ninja",
|
|
||||||
"binaryDir": "${sourceDir}/build-${presetName}",
|
|
||||||
"cacheVariables": {
|
|
||||||
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
|
|
||||||
"CMAKE_CXX_COMPILER": "icx",
|
|
||||||
"CMAKE_C_COMPILER": "cl",
|
|
||||||
"GGML_SYCL": "ON",
|
|
||||||
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
|
|
||||||
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
|
|
||||||
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
|
||||||
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
|
|
||||||
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
|
|
||||||
{ "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },
|
|
||||||
|
|
||||||
{
|
|
||||||
"name": "x64-windows-llvm", "hidden": true,
|
|
||||||
"cacheVariables": {
|
|
||||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
|
||||||
"name": "arm64-windows-msvc", "hidden": true,
|
|
||||||
"architecture": { "value": "arm64", "strategy": "external" },
|
|
||||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
|
||||||
"cacheVariables": {
|
|
||||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
|
||||||
"name": "arm64-windows-llvm", "hidden": true,
|
|
||||||
"architecture": { "value": "arm64", "strategy": "external" },
|
|
||||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
|
||||||
"cacheVariables": {
|
|
||||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
|
||||||
"name": "arm64-apple-clang", "hidden": true,
|
|
||||||
"architecture": { "value": "arm64", "strategy": "external" },
|
|
||||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
|
||||||
"cacheVariables": {
|
|
||||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
{ "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
|
||||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
|
||||||
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
|
|
||||||
|
|
||||||
{ "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
|
|
||||||
{ "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
|
|
||||||
{ "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
|
|
||||||
|
|
||||||
{ "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
|
||||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
|
||||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
|
||||||
|
|
||||||
{ "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
|
|
||||||
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
|
|
||||||
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
|
|
||||||
{ "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
|
|
||||||
|
|
||||||
{ "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
|
|
||||||
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
|
|
||||||
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
|
|
||||||
|
|
||||||
{ "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
|
|
||||||
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
|
|
||||||
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
|
|
||||||
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
|
|
||||||
|
|
||||||
{ "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
|
|
||||||
{ "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
|
|
||||||
]
|
|
||||||
}
|
|
11
CODEOWNERS
11
CODEOWNERS
|
@ -1,11 +0,0 @@
|
||||||
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
|
|
||||||
|
|
||||||
/ci/ @ggerganov
|
|
||||||
/.devops/*.Dockerfile @ngxson
|
|
||||||
/examples/server/ @ngxson
|
|
||||||
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
|
|
||||||
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
|
|
||||||
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
|
|
||||||
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
|
|
||||||
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
|
||||||
/ggml/src/gguf.cpp @JohannesGaessler
|
|
125
CONTRIBUTING.md
125
CONTRIBUTING.md
|
@ -1,125 +0,0 @@
|
||||||
# Pull requests (for contributors)
|
|
||||||
|
|
||||||
- Test your changes:
|
|
||||||
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
|
||||||
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
|
|
||||||
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
|
|
||||||
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
|
|
||||||
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
|
|
||||||
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
|
|
||||||
|
|
||||||
# Pull requests (for collaborators)
|
|
||||||
|
|
||||||
- Squash-merge PRs
|
|
||||||
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
|
|
||||||
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
|
|
||||||
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
|
|
||||||
|
|
||||||
# Coding guidelines
|
|
||||||
|
|
||||||
- Avoid adding third-party dependencies, extra files, extra headers, etc.
|
|
||||||
- Always consider cross-compatibility with other operating systems and architectures
|
|
||||||
- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
|
|
||||||
- Vertical alignment makes things more readable and easier to batch edit
|
|
||||||
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
|
|
||||||
- Use sized integer types such as `int32_t` in the public API; `size_t` may also be appropriate for allocation sizes or byte offsets
|
|
||||||
- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo`
|
|
||||||
- In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary
|
|
||||||
```cpp
|
|
||||||
// OK
|
|
||||||
llama_context * ctx;
|
|
||||||
const llama_rope_type rope_type;
|
|
||||||
|
|
||||||
// not OK
|
|
||||||
struct llama_context * ctx;
|
|
||||||
const enum llama_rope_type rope_type;
|
|
||||||
```
|
|
||||||
|
|
||||||
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
|
|
||||||
|
|
||||||
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
|
|
||||||
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
|
|
||||||
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
|
|
||||||
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ A worked shape example is sketched after the figure below.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
# Naming guidelines
|
|
||||||
|
|
||||||
- Use `snake_case` for function, variable and type names
|
|
||||||
- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
// not OK
|
|
||||||
int small_number;
|
|
||||||
int big_number;
|
|
||||||
|
|
||||||
// OK
|
|
||||||
int number_small;
|
|
||||||
int number_big;
|
|
||||||
```
|
|
||||||
|
|
||||||
- Enum values are always in upper case and prefixed with the enum name
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
enum llama_vocab_type {
|
|
||||||
LLAMA_VOCAB_TYPE_NONE = 0,
|
|
||||||
LLAMA_VOCAB_TYPE_SPM = 1,
|
|
||||||
LLAMA_VOCAB_TYPE_BPE = 2,
|
|
||||||
LLAMA_VOCAB_TYPE_WPM = 3,
|
|
||||||
LLAMA_VOCAB_TYPE_UGM = 4,
|
|
||||||
LLAMA_VOCAB_TYPE_RWKV = 5,
|
|
||||||
};
|
|
||||||
```
|
|
||||||
|
|
||||||
- The general naming pattern is `<class>_<method>`, with `<method>` being `<action>_<noun>`
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
llama_model_init(); // class: "llama_model", method: "init"
|
|
||||||
llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove"
|
|
||||||
llama_sampler_get_seed(); // class: "llama_sampler", method: "get_seed"
|
|
||||||
llama_set_embeddings(); // class: "llama_context", method: "set_embeddings"
|
|
||||||
llama_n_threads(); // class: "llama_context", method: "n_threads"
|
|
||||||
llama_adapter_lora_free(); // class: "llama_adapter_lora", method: "free"
|
|
||||||
```
|
|
||||||
|
|
||||||
- The `get` `<action>` can be omitted
|
|
||||||
- The `<noun>` can be omitted if not necessary
|
|
||||||
- The `_context` suffix of the `<class>` is optional. Use it to disambiguate symbols when needed
|
|
||||||
- Use `init`/`free` for constructor/destructor `<action>`
|
|
||||||
|
|
||||||
- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
typedef struct llama_context * llama_context_t;
|
|
||||||
|
|
||||||
enum llama_pooling_type llama_pooling_type(const llama_context_t ctx);
|
|
||||||
```
|
|
||||||
|
|
||||||
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_
|
|
||||||
|
|
||||||
- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension
|
|
||||||
- Python filenames are all lowercase with underscores
|
|
||||||
|
|
||||||
- _(TODO: abbreviations usage)_
|
|
||||||
|
|
||||||
# Preprocessor directives
|
|
||||||
|
|
||||||
- _(TODO: add guidelines with examples and apply them to the codebase)_
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
#ifdef FOO
|
|
||||||
#endif // FOO
|
|
||||||
```
|
|
||||||
|
|
||||||
# Documentation
|
|
||||||
|
|
||||||
- Documentation is a community effort
|
|
||||||
- When you need to look into the source code to figure out how to use an API consider adding a short summary to the header file for future reference
|
|
||||||
- When you notice incorrect or outdated documentation, please update it
|
|
||||||
|
|
||||||
# Resources
|
|
||||||
|
|
||||||
The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
|
|
||||||
|
|
||||||
https://github.com/ggerganov/llama.cpp/projects
|
|
2
LICENSE
2
LICENSE
|
@ -1,6 +1,6 @@
|
||||||
MIT License
|
MIT License
|
||||||
|
|
||||||
Copyright (c) 2023-2024 The ggml authors
|
Copyright (c) 2023 Georgi Gerganov
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
|
|
@ -14,6 +14,48 @@ let package = Package(
|
||||||
.library(name: "llama", targets: ["llama"]),
|
.library(name: "llama", targets: ["llama"]),
|
||||||
],
|
],
|
||||||
targets: [
|
targets: [
|
||||||
.systemLibrary(name: "llama", pkgConfig: "llama"),
|
.target(
|
||||||
]
|
name: "llama",
|
||||||
|
path: ".",
|
||||||
|
exclude: [
|
||||||
|
"cmake",
|
||||||
|
"examples",
|
||||||
|
"scripts",
|
||||||
|
"models",
|
||||||
|
"tests",
|
||||||
|
"CMakeLists.txt",
|
||||||
|
"ggml-cuda.cu",
|
||||||
|
"ggml-cuda.h",
|
||||||
|
"Makefile"
|
||||||
|
],
|
||||||
|
sources: [
|
||||||
|
"ggml.c",
|
||||||
|
"llama.cpp",
|
||||||
|
"unicode.cpp",
|
||||||
|
"ggml-alloc.c",
|
||||||
|
"ggml-backend.c",
|
||||||
|
"ggml-quants.c",
|
||||||
|
"ggml-metal.m",
|
||||||
|
],
|
||||||
|
resources: [
|
||||||
|
.process("ggml-metal.metal")
|
||||||
|
],
|
||||||
|
publicHeadersPath: "spm-headers",
|
||||||
|
cSettings: [
|
||||||
|
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
|
||||||
|
.define("GGML_USE_ACCELERATE"),
|
||||||
|
.unsafeFlags(["-fno-objc-arc"]),
|
||||||
|
.define("GGML_USE_METAL"),
|
||||||
|
// NOTE: NEW_LAPACK will required iOS version 16.4+
|
||||||
|
// We should consider add this in the future when we drop support for iOS 14
|
||||||
|
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
|
||||||
|
// .define("ACCELERATE_NEW_LAPACK"),
|
||||||
|
// .define("ACCELERATE_LAPACK_ILP64")
|
||||||
|
],
|
||||||
|
linkerSettings: [
|
||||||
|
.linkedFramework("Accelerate")
|
||||||
|
]
|
||||||
|
)
|
||||||
|
],
|
||||||
|
cxxLanguageStandard: .cxx11
|
||||||
)
|
)
|
||||||
|
|
541
README-sycl.md
Normal file
541
README-sycl.md
Normal file
|
@ -0,0 +1,541 @@
|
||||||
|
# llama.cpp for SYCL
|
||||||
|
|
||||||
|
- [Background](#background)
|
||||||
|
- [News](#news)
|
||||||
|
- [OS](#os)
|
||||||
|
- [Intel GPU](#intel-gpu)
|
||||||
|
- [Docker](#docker)
|
||||||
|
- [Linux](#linux)
|
||||||
|
- [Windows](#windows)
|
||||||
|
- [Environment Variable](#environment-variable)
|
||||||
|
- [Known Issue](#known-issue)
|
||||||
|
- [Q&A](#q&a)
|
||||||
|
- [Todo](#todo)
|
||||||
|
|
||||||
|
## Background
|
||||||
|
|
||||||
|
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
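As a rough illustration of the single-source model (this snippet is not part of llama.cpp; it assumes a SYCL 2020 compiler such as `icpx -fsycl`), the host code and the device kernel live in the same C++ translation unit:

```cpp
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
    sycl::queue q; // selects a default device (a GPU if one is available)

    const size_t n = 1024;
    float * x = sycl::malloc_shared<float>(n, q); // USM allocation visible to host and device
    for (size_t i = 0; i < n; i++) {
        x[i] = 1.0f;
    }

    // the device kernel is written as a plain C++ lambda in the same source file
    q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
        x[i] *= 2.0f;
    }).wait();

    std::printf("x[0] = %f on %s\n", x[0],
                q.get_device().get_info<sycl::info::device::name>().c_str());

    sycl::free(x, q);
    return 0;
}
```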
|
||||||
|
|
||||||
|
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
|
||||||
|
|
||||||
|
Intel uses SYCL as a direct programming language to support CPUs, GPUs and FPGAs.
|
||||||
|
|
||||||
|
To avoid re-inventing the wheel, this code refers to other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use the open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) to migrate to SYCL.
|
||||||
|
|
||||||
|
llama.cpp for SYCL is used to support Intel GPUs.
|
||||||
|
|
||||||
|
For Intel CPUs, we recommend using llama.cpp for x86 (the Intel MKL build).
|
||||||
|
|
||||||
|
## News
|
||||||
|
|
||||||
|
- 2024.3
|
||||||
|
- Support multiple cards: **--split-mode**: [none|layer]; [row] is not supported yet and is under development.
|
||||||
|
- Support assigning the main GPU with **--main-gpu**, replacing $GGML_SYCL_DEVICE.
|
||||||
|
- Support detecting all GPUs that have Level Zero and the same top **Max compute units**.
|
||||||
|
- Support OPs
|
||||||
|
- hardsigmoid
|
||||||
|
- hardswish
|
||||||
|
- pool2d
|
||||||
|
|
||||||
|
- 2024.1
|
||||||
|
- Create SYCL backend for Intel GPU.
|
||||||
|
- Support Windows build
|
||||||
|
|
||||||
|
## OS
|
||||||
|
|
||||||
|
|OS|Status|Verified|
|
||||||
|
|-|-|-|
|
||||||
|
|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
|
||||||
|
|Windows|Support|Windows 11|
|
||||||
|
|
||||||
|
|
||||||
|
## Intel GPU
|
||||||
|
|
||||||
|
### Verified
|
||||||
|
|
||||||
|
|Intel GPU| Status | Verified Model|
|
||||||
|
|-|-|-|
|
||||||
|
|Intel Data Center Max Series| Support| Max 1550|
|
||||||
|
|Intel Data Center Flex Series| Support| Flex 170|
|
||||||
|
|Intel Arc Series| Support| Arc 770, 730M|
|
||||||
|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||||
|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
|
||||||
|
|
||||||
|
Note: If the iGPU has fewer than 80 EUs (Execution Units), the inference speed will be too slow to be practical.
|
||||||
|
|
||||||
|
### Memory
|
||||||
|
|
||||||
|
Memory is a limiting factor when running LLMs on GPUs.
|
||||||
|
|
||||||
|
When llama.cpp runs, it prints a log line showing how much GPU memory is allocated, e.g. `llm_load_tensors: buffer size = 3577.56 MiB`, so you can see how much memory your case needs.
|
||||||
|
|
||||||
|
For iGPUs, please make sure there is enough shared host memory. For llama-2-7b.Q4_0, 8 GB+ of host memory is recommended.
|
||||||
|
|
||||||
|
For dGPUs, please make sure there is enough device memory. For llama-2-7b.Q4_0, 4 GB+ of device memory is recommended.
|
||||||
|
|
||||||
|
## Nvidia GPU
|
||||||
|
|
||||||
|
### Verified
|
||||||
|
|
||||||
|
|Nvidia GPU| Status | Verified Model|
|
||||||
|
|-|-|-|
|
||||||
|
|Ampere Series| Support| A100|
|
||||||
|
|
||||||
|
### oneMKL
|
||||||
|
|
||||||
|
The current oneMKL release does not contain the oneMKL cuBLAS backend.
|
||||||
|
As a result, for Nvidia GPUs, oneMKL must be built from source.
|
||||||
|
|
||||||
|
```
|
||||||
|
git clone https://github.com/oneapi-src/oneMKL
|
||||||
|
cd oneMKL
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake -G Ninja .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON
|
||||||
|
ninja
|
||||||
|
// Add paths as necessary
|
||||||
|
```
|
||||||
|
|
||||||
|
## Docker
|
||||||
|
|
||||||
|
Note:
|
||||||
|
- Only docker on Linux is tested. Docker on WSL may not work.
|
||||||
|
- You may need to install the Intel GPU driver on the host machine (see the [Linux](#linux) section for how to do that)
|
||||||
|
|
||||||
|
### Build the image
|
||||||
|
|
||||||
|
You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
|
||||||
|
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# For F16:
|
||||||
|
#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
|
||||||
|
|
||||||
|
# Or, for F32:
|
||||||
|
docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
|
||||||
|
|
||||||
|
# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Firstly, find all the DRI cards:
|
||||||
|
ls -la /dev/dri
|
||||||
|
# Then, pick the card that you want to use.
|
||||||
|
|
||||||
|
# For example with "/dev/dri/card1"
|
||||||
|
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||||
|
```
|
||||||
|
|
||||||
|
## Linux
|
||||||
|
|
||||||
|
### Setup Environment
|
||||||
|
|
||||||
|
1. Install Intel GPU driver.
|
||||||
|
|
||||||
|
a. Please install the Intel GPU driver by following the official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
|
||||||
|
|
||||||
|
Note: for iGPU, please install the client GPU driver.
|
||||||
|
|
||||||
|
b. Add your user to the video and render groups.
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo usermod -aG render username
|
||||||
|
sudo usermod -aG video username
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: re-login for the change to take effect.
|
||||||
|
|
||||||
|
c. Check
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo apt install clinfo
|
||||||
|
sudo clinfo -l
|
||||||
|
```
|
||||||
|
|
||||||
|
Output (example):
|
||||||
|
|
||||||
|
```
|
||||||
|
Platform #0: Intel(R) OpenCL Graphics
|
||||||
|
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
|
||||||
|
|
||||||
|
|
||||||
|
Platform #0: Intel(R) OpenCL HD Graphics
|
||||||
|
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Install Intel® oneAPI Base toolkit.
|
||||||
|
|
||||||
|
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||||
|
|
||||||
|
We recommend installing to the default folder: **/opt/intel/oneapi**.
|
||||||
|
|
||||||
|
The following guide uses the default folder as an example. If you installed to a different folder, adjust the paths in the guide accordingly.
|
||||||
|
|
||||||
|
b. Check
|
||||||
|
|
||||||
|
```sh
|
||||||
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
|
||||||
|
sycl-ls
|
||||||
|
```
|
||||||
|
|
||||||
|
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
|
||||||
|
|
||||||
|
Output (example):
|
||||||
|
```
|
||||||
|
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||||
|
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||||
|
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||||
|
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Build locally:
|
||||||
|
|
||||||
|
Note:
|
||||||
|
- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
|
||||||
|
- By default, all binaries are built, which takes more time. To reduce build time, we recommend building **example/main** only.
|
||||||
|
|
||||||
|
```sh
|
||||||
|
mkdir -p build
|
||||||
|
cd build
|
||||||
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
|
||||||
|
# For FP16:
|
||||||
|
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
||||||
|
|
||||||
|
# Or, for FP32:
|
||||||
|
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||||
|
|
||||||
|
# For Nvidia GPUs
|
||||||
|
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||||
|
|
||||||
|
# Build example/main only
|
||||||
|
#cmake --build . --config Release --target main
|
||||||
|
|
||||||
|
# Or, build all binary
|
||||||
|
cmake --build . --config Release -v
|
||||||
|
|
||||||
|
cd ..
|
||||||
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
```sh
|
||||||
|
./examples/sycl/build.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run
|
||||||
|
|
||||||
|
1. Put the model file in the **models** folder
|
||||||
|
|
||||||
|
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as an example.
|
||||||
|
|
||||||
|
2. Enable oneAPI running environment
|
||||||
|
|
||||||
|
```
|
||||||
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
3. List device ID
|
||||||
|
|
||||||
|
Run without parameter:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
./build/bin/ls-sycl-device
|
||||||
|
|
||||||
|
# or running the "main" executable and look at the output log:
|
||||||
|
|
||||||
|
./build/bin/main
|
||||||
|
```
|
||||||
|
|
||||||
|
Check the ID in startup log, like:
|
||||||
|
|
||||||
|
```
|
||||||
|
found 4 SYCL devices:
|
||||||
|
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||||
|
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||||
|
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||||
|
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||||
|
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||||
|
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||||
|
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||||
|
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
|Attribute|Note|
|
||||||
|
|-|-|
|
||||||
|
|compute capability 1.3|Level-zero running time, recommended |
|
||||||
|
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||||
|
|
||||||
|
4. Set device ID and execute llama.cpp
|
||||||
|
|
||||||
|
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||||
|
|
||||||
|
```sh
|
||||||
|
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||||
|
```
|
||||||
|
or run by script:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
./examples/sycl/run_llama2.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Note:
|
||||||
|
|
||||||
|
- By default, mmap is used to read the model file. In some cases this leads to a hang. We recommend using the **--no-mmap** parameter to disable mmap() and avoid this issue.
|
||||||
|
|
||||||
|
|
||||||
|
5. Check the device ID in output
|
||||||
|
|
||||||
|
Like:
|
||||||
|
```
|
||||||
|
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||||
|
```
|
||||||
|
|
||||||
|
## Windows
|
||||||
|
|
||||||
|
### Setup Environment
|
||||||
|
|
||||||
|
1. Install Intel GPU driver.
|
||||||
|
|
||||||
|
Please install the Intel GPU driver by following the official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
|
||||||
|
|
||||||
|
Note: **The driver is mandatory for compute functionality**.
|
||||||
|
|
||||||
|
2. Install Visual Studio.
|
||||||
|
|
||||||
|
Please install [Visual Studio](https://visualstudio.microsoft.com/), which is required for enabling the oneAPI environment on Windows.
|
||||||
|
|
||||||
|
3. Install Intel® oneAPI Base toolkit.
|
||||||
|
|
||||||
|
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||||
|
|
||||||
|
We recommend installing to the default folder: **C:\Program Files (x86)\Intel\oneAPI**.
|
||||||
|
|
||||||
|
The following guide uses the default folder as an example. If you installed to a different folder, adjust the paths in the guide accordingly.
|
||||||
|
|
||||||
|
b. Enable oneAPI running environment:
|
||||||
|
|
||||||
|
- In Search, input 'oneAPI'.
|
||||||
|
|
||||||
|
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
|
||||||
|
|
||||||
|
- In Run:
|
||||||
|
|
||||||
|
In CMD:
|
||||||
|
```
|
||||||
|
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
|
||||||
|
```
|
||||||
|
|
||||||
|
c. Check GPU
|
||||||
|
|
||||||
|
In oneAPI command line:
|
||||||
|
|
||||||
|
```
|
||||||
|
sycl-ls
|
||||||
|
```
|
||||||
|
|
||||||
|
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
|
||||||
|
|
||||||
|
Output (example):
|
||||||
|
```
|
||||||
|
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||||
|
[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||||
|
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186]
|
||||||
|
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Install cmake & make
|
||||||
|
|
||||||
|
a. Download & install cmake for Windows: https://cmake.org/download/
|
||||||
|
|
||||||
|
b. Download & install mingw-w64 make for Windows provided by w64devkit
|
||||||
|
|
||||||
|
- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
|
||||||
|
|
||||||
|
- Extract `w64devkit` on your PC.
|
||||||
|
|
||||||
|
- Add the **bin** folder path to the Windows system PATH environment variable, e.g. `C:\xxx\w64devkit\bin\`.
|
||||||
|
|
||||||
|
### Build locally:
|
||||||
|
|
||||||
|
In oneAPI command line window:
|
||||||
|
|
||||||
|
```
|
||||||
|
mkdir -p build
|
||||||
|
cd build
|
||||||
|
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||||
|
|
||||||
|
:: for FP16
|
||||||
|
:: faster for long-prompt inference
|
||||||
|
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||||
|
|
||||||
|
:: for FP32
|
||||||
|
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||||
|
|
||||||
|
|
||||||
|
:: build example/main only
|
||||||
|
:: make main
|
||||||
|
|
||||||
|
:: build all binary
|
||||||
|
make -j
|
||||||
|
cd ..
|
||||||
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
```
|
||||||
|
.\examples\sycl\win-build-sycl.bat
|
||||||
|
```
|
||||||
|
|
||||||
|
Note:
|
||||||
|
|
||||||
|
- By default, all binaries are built, which takes more time. To reduce build time, we recommend building **example/main** only.
|
||||||
|
|
||||||
|
### Run
|
||||||
|
|
||||||
|
1. Put the model file in the **models** folder
|
||||||
|
|
||||||
|
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as an example.
|
||||||
|
|
||||||
|
2. Enable oneAPI running environment
|
||||||
|
|
||||||
|
- In Search, input 'oneAPI'.
|
||||||
|
|
||||||
|
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
|
||||||
|
|
||||||
|
- In Run:
|
||||||
|
|
||||||
|
In CMD:
|
||||||
|
```
|
||||||
|
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
|
||||||
|
```
|
||||||
|
|
||||||
|
3. List device ID
|
||||||
|
|
||||||
|
Run without parameter:
|
||||||
|
|
||||||
|
```
|
||||||
|
build\bin\ls-sycl-device.exe
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
build\bin\main.exe
|
||||||
|
```
|
||||||
|
|
||||||
|
Check the ID in startup log, like:
|
||||||
|
|
||||||
|
```
|
||||||
|
found 4 SYCL devices:
|
||||||
|
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||||
|
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||||
|
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||||
|
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||||
|
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||||
|
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||||
|
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||||
|
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
|Attribute|Note|
|
||||||
|
|-|-|
|
||||||
|
|compute capability 1.3|Level-zero running time, recommended |
|
||||||
|
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||||
|
|
||||||
|
4. Set device ID and execute llama.cpp
|
||||||
|
|
||||||
|
Set device ID = 0 by **set GGML_SYCL_DEVICE=0**
|
||||||
|
|
||||||
|
```
|
||||||
|
set GGML_SYCL_DEVICE=0
|
||||||
|
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
|
||||||
|
```
|
||||||
|
or run by script:
|
||||||
|
|
||||||
|
```
|
||||||
|
.\examples\sycl\win-run-llama2.bat
|
||||||
|
```
|
||||||
|
|
||||||
|
Note:
|
||||||
|
|
||||||
|
- By default, mmap is used to read the model file. In some cases this leads to a hang. We recommend using the **--no-mmap** parameter to disable mmap() and avoid this issue.
|
||||||
|
|
||||||
|
|
||||||
|
5. Check the device ID in output
|
||||||
|
|
||||||
|
Like:
|
||||||
|
```
|
||||||
|
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||||
|
```
|
||||||
|
|
||||||
|
## Environment Variable
|
||||||
|
|
||||||
|
#### Build
|
||||||
|
|
||||||
|
|Name|Value|Function|
|
||||||
|
|-|-|-|
|
||||||
|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|
||||||
|
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, do not set it.|
|
||||||
|
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|
||||||
|
|CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path|
|
||||||
|
|
||||||
|
#### Running
|
||||||
|
|
||||||
|
|
||||||
|
|Name|Value|Function|
|
||||||
|
|-|-|-|
|
||||||
|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device ID to use. Check the device IDs in the default running output|
|
||||||
|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable the log function via the GGML_SYCL_DEBUG macro|
|
||||||
|
|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support getting the free memory of the GPU via sycl::aspect::ext_intel_free_memory.<br>Recommended when --split-mode = layer|
|
||||||
|
|
||||||
|
## Known Issue
|
||||||
|
|
||||||
|
- Hang during startup
|
||||||
|
|
||||||
|
llama.cpp uses mmap by default to read the model file and copy it to the GPU. On some systems, the memcpy can misbehave and block.
|
||||||
|
|
||||||
|
Solution: add **--no-mmap** or **--mmap 0**.
|
||||||
|
|
||||||
|
- Split-mode: [row] is not supported
|
||||||
|
|
||||||
|
It is still under development.
|
||||||
|
|
||||||
|
## Q&A
|
||||||
|
|
||||||
|
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||||
|
|
||||||
|
The oneAPI runtime environment was not enabled.
|
||||||
|
|
||||||
|
Install the oneAPI Base Toolkit and enable it with: `source /opt/intel/oneapi/setvars.sh`.
|
||||||
|
|
||||||
|
- On Windows, there is no output and no error.
|
||||||
|
|
||||||
|
The oneAPI runtime environment was not enabled.
|
||||||
|
|
||||||
|
- A compile error occurs.
|
||||||
|
|
||||||
|
Remove folder **build** and try again.
|
||||||
|
|
||||||
|
- I can **not** see **[ext_oneapi_level_zero:gpu:0]** after installing the GPU driver on Linux.
|
||||||
|
|
||||||
|
Please run **sudo sycl-ls**.
|
||||||
|
|
||||||
|
If you see the device in the result, add your user to the video/render groups:
|
||||||
|
|
||||||
|
```
|
||||||
|
sudo usermod -aG render username
|
||||||
|
sudo usermod -aG video username
|
||||||
|
```
|
||||||
|
|
||||||
|
Then **relogin**.
|
||||||
|
|
||||||
|
If you still do not see it, check the GPU driver installation steps again.
|
||||||
|
|
||||||
|
## Todo
|
||||||
|
|
||||||
|
- Support multiple cards.
|
67
SECURITY.md
67
SECURITY.md
|
@ -1,67 +0,0 @@
|
||||||
# Security Policy
|
|
||||||
|
|
||||||
- [**Using llama.cpp securely**](#using-llamacpp-securely)
|
|
||||||
- [Untrusted models](#untrusted-models)
|
|
||||||
- [Untrusted inputs](#untrusted-inputs)
|
|
||||||
- [Data privacy](#data-privacy)
|
|
||||||
- [Untrusted environments or networks](#untrusted-environments-or-networks)
|
|
||||||
- [Multi-Tenant environments](#multi-tenant-environments)
|
|
||||||
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
|
|
||||||
|
|
||||||
## Using llama.cpp securely
|
|
||||||
|
|
||||||
### Untrusted models
|
|
||||||
Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.
|
|
||||||
|
|
||||||
*Always execute untrusted models within a secure, isolated environment such as a sandbox* (e.g., containers, virtual machines). This helps protect your system from potentially malicious code.
|
|
||||||
|
|
||||||
> [!NOTE]
|
|
||||||
> The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.
|
|
||||||
|
|
||||||
### Untrusted inputs
|
|
||||||
|
|
||||||
Some models accept various input formats (text, images, audio, etc.). The libraries converting these inputs have varying security levels, so it's crucial to isolate the model and carefully pre-process inputs to mitigate script injection risks.
|
|
||||||
|
|
||||||
For maximum security when handling untrusted inputs, you may need to employ the following:
|
|
||||||
|
|
||||||
* Sandboxing: Isolate the environment where the inference happens.
|
|
||||||
* Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics.
|
|
||||||
* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches.
|
|
||||||
* Input Sanitization: Before feeding data to the model, sanitize inputs rigorously (a minimal validation sketch is shown after this list). This involves techniques such as:
|
|
||||||
* Validation: Enforce strict rules on allowed characters and data types.
|
|
||||||
* Filtering: Remove potentially malicious scripts or code fragments.
|
|
||||||
* Encoding: Convert special characters into safe representations.
|
|
||||||
* Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)).
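A minimal validation sketch (illustrative only; the helper name, size limit and allow-list below are assumptions to adapt to your application, not part of llama.cpp):

```cpp
#include <cctype>
#include <cstddef>
#include <string>

// Reject inputs that exceed a size limit or contain characters outside an
// explicit allow-list (printable ASCII plus common whitespace).
bool is_allowed_prompt(const std::string & input, std::size_t max_len = 4096) {
    if (input.size() > max_len) {
        return false;
    }
    for (unsigned char c : input) {
        if (!(std::isprint(c) || c == '\n' || c == '\t' || c == '\r')) {
            return false;
        }
    }
    return true;
}
```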
|
|
||||||
|
|
||||||
### Data privacy
|
|
||||||
|
|
||||||
To protect sensitive data from potential leaks or unauthorized access, it is crucial to sandbox the model execution. This means running the model in a secure, isolated environment, which helps mitigate many attack vectors.
|
|
||||||
|
|
||||||
### Untrusted environments or networks
|
|
||||||
|
|
||||||
If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
|
|
||||||
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
|
|
||||||
* Encrypt your data if sending it over the network.
|
|
||||||
|
|
||||||
### Multi-Tenant environments
|
|
||||||
|
|
||||||
If you intend to run multiple models in parallel with shared memory, it is your responsibility to ensure the models do not interact or access each other's data. The primary areas of concern are tenant isolation, resource allocation, model sharing and hardware attacks.
|
|
||||||
|
|
||||||
1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.
|
|
||||||
|
|
||||||
2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
|
|
||||||
|
|
||||||
3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
|
|
||||||
|
|
||||||
4. Hardware Attacks: GPUs or TPUs can also be attacked. [Research](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side-channel attacks on GPUs are possible, which can leak data from other models or processes running on the same system at the same time.
|
|
||||||
|
|
||||||
## Reporting a vulnerability
|
|
||||||
|
|
||||||
Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
|
|
||||||
|
|
||||||
<!-- normal version -->
|
|
||||||
However, if you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
|
|
||||||
|
|
||||||
Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
|
|
||||||
|
|
||||||
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
|
|
|
@ -1,4 +0,0 @@
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include <llama.h>
|
|
||||||
|
|
|
@ -1,5 +0,0 @@
|
||||||
module llama [system] {
|
|
||||||
header "llama.h"
|
|
||||||
link "llama"
|
|
||||||
export *
|
|
||||||
}
|
|
140
build.zig
Normal file
140
build.zig
Normal file
|
@ -0,0 +1,140 @@
|
||||||
|
// Compatible with Zig Version 0.11.0
|
||||||
|
const std = @import("std");
|
||||||
|
const ArrayList = std.ArrayList;
|
||||||
|
const Compile = std.Build.Step.Compile;
|
||||||
|
const ConfigHeader = std.Build.Step.ConfigHeader;
|
||||||
|
const Mode = std.builtin.Mode;
|
||||||
|
const CrossTarget = std.zig.CrossTarget;
|
||||||
|
|
||||||
|
const Maker = struct {
|
||||||
|
builder: *std.build.Builder,
|
||||||
|
target: CrossTarget,
|
||||||
|
optimize: Mode,
|
||||||
|
enable_lto: bool,
|
||||||
|
|
||||||
|
include_dirs: ArrayList([]const u8),
|
||||||
|
cflags: ArrayList([]const u8),
|
||||||
|
cxxflags: ArrayList([]const u8),
|
||||||
|
objs: ArrayList(*Compile),
|
||||||
|
|
||||||
|
fn addInclude(m: *Maker, dir: []const u8) !void {
|
||||||
|
try m.include_dirs.append(dir);
|
||||||
|
}
|
||||||
|
fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
|
||||||
|
try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
|
||||||
|
}
|
||||||
|
fn addCFlag(m: *Maker, flag: []const u8) !void {
|
||||||
|
try m.cflags.append(flag);
|
||||||
|
}
|
||||||
|
fn addCxxFlag(m: *Maker, flag: []const u8) !void {
|
||||||
|
try m.cxxflags.append(flag);
|
||||||
|
}
|
||||||
|
fn addFlag(m: *Maker, flag: []const u8) !void {
|
||||||
|
try m.addCFlag(flag);
|
||||||
|
try m.addCxxFlag(flag);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn init(builder: *std.build.Builder) !Maker {
|
||||||
|
const target = builder.standardTargetOptions(.{});
|
||||||
|
const zig_version = @import("builtin").zig_version_string;
|
||||||
|
const commit_hash = try std.ChildProcess.exec(
|
||||||
|
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
|
||||||
|
);
|
||||||
|
try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
|
||||||
|
\\int LLAMA_BUILD_NUMBER = {};
|
||||||
|
\\char const *LLAMA_COMMIT = "{s}";
|
||||||
|
\\char const *LLAMA_COMPILER = "Zig {s}";
|
||||||
|
\\char const *LLAMA_BUILD_TARGET = "{s}";
|
||||||
|
\\
|
||||||
|
, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
|
||||||
|
var m = Maker{
|
||||||
|
.builder = builder,
|
||||||
|
.target = target,
|
||||||
|
.optimize = builder.standardOptimizeOption(.{}),
|
||||||
|
.enable_lto = false,
|
||||||
|
.include_dirs = ArrayList([]const u8).init(builder.allocator),
|
||||||
|
.cflags = ArrayList([]const u8).init(builder.allocator),
|
||||||
|
.cxxflags = ArrayList([]const u8).init(builder.allocator),
|
||||||
|
.objs = ArrayList(*Compile).init(builder.allocator),
|
||||||
|
};
|
||||||
|
|
||||||
|
try m.addCFlag("-std=c11");
|
||||||
|
try m.addCxxFlag("-std=c++11");
|
||||||
|
try m.addProjectInclude(&.{});
|
||||||
|
try m.addProjectInclude(&.{"common"});
|
||||||
|
return m;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
|
||||||
|
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
|
||||||
|
if (o.target.getAbi() != .msvc)
|
||||||
|
o.defineCMacro("_GNU_SOURCE", null);
|
||||||
|
|
||||||
|
if (std.mem.endsWith(u8, src, ".c")) {
|
||||||
|
o.addCSourceFiles(&.{src}, m.cflags.items);
|
||||||
|
o.linkLibC();
|
||||||
|
} else {
|
||||||
|
o.addCSourceFiles(&.{src}, m.cxxflags.items);
|
||||||
|
if (o.target.getAbi() == .msvc) {
|
||||||
|
o.linkLibC(); // need winsdk + crt
|
||||||
|
} else {
|
||||||
|
// linkLibCpp already add (libc++ + libunwind + libc)
|
||||||
|
o.linkLibCpp();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
|
||||||
|
o.want_lto = m.enable_lto;
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
|
||||||
|
const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
|
||||||
|
e.addCSourceFiles(&.{src}, m.cxxflags.items);
|
||||||
|
for (deps) |d| e.addObject(d);
|
||||||
|
for (m.objs.items) |o| e.addObject(o);
|
||||||
|
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
|
||||||
|
|
||||||
|
// https://github.com/ziglang/zig/issues/15448
|
||||||
|
if (e.target.getAbi() == .msvc) {
|
||||||
|
e.linkLibC(); // need winsdk + crt
|
||||||
|
} else {
|
||||||
|
// linkLibCpp already add (libc++ + libunwind + libc)
|
||||||
|
e.linkLibCpp();
|
||||||
|
}
|
||||||
|
m.builder.installArtifact(e);
|
||||||
|
e.want_lto = m.enable_lto;
|
||||||
|
return e;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
pub fn build(b: *std.build.Builder) !void {
|
||||||
|
var make = try Maker.init(b);
|
||||||
|
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
|
||||||
|
|
||||||
|
const ggml = make.obj("ggml", "ggml.c");
|
||||||
|
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
|
||||||
|
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
|
||||||
|
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
|
||||||
|
const unicode = make.obj("unicode", "unicode.cpp");
|
||||||
|
const llama = make.obj("llama", "llama.cpp");
|
||||||
|
const buildinfo = make.obj("common", "common/build-info.cpp");
|
||||||
|
const common = make.obj("common", "common/common.cpp");
|
||||||
|
const console = make.obj("console", "common/console.cpp");
|
||||||
|
const sampling = make.obj("sampling", "common/sampling.cpp");
|
||||||
|
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
|
||||||
|
const train = make.obj("train", "common/train.cpp");
|
||||||
|
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
||||||
|
const llava = make.obj("llava", "examples/llava/llava.cpp");
|
||||||
|
|
||||||
|
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, console, grammar_parser });
|
||||||
|
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
|
||||||
|
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
|
||||||
|
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
|
||||||
|
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
|
||||||
|
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
|
||||||
|
|
||||||
|
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, grammar_parser, clip, llava });
|
||||||
|
if (server.target.isWindows()) {
|
||||||
|
server.linkSystemLibrary("ws2_32");
|
||||||
|
}
|
||||||
|
}
|
616
ci/run.sh
616
ci/run.sh
|
@ -1,4 +1,4 @@
|
||||||
#!/bin/bash
|
#/bin/bash
|
||||||
#
|
#
|
||||||
# sample usage:
|
# sample usage:
|
||||||
#
|
#
|
||||||
|
@ -13,9 +13,6 @@
|
||||||
# # with SYCL support
|
# # with SYCL support
|
||||||
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
#
|
#
|
||||||
# # with VULKAN support
|
|
||||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
|
||||||
#
|
|
||||||
|
|
||||||
if [ -z "$2" ]; then
|
if [ -z "$2" ]; then
|
||||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||||
|
@ -39,11 +36,11 @@ SRC=`pwd`
|
||||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
|
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||||
|
@ -53,11 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
|
|
||||||
fi
|
fi
|
||||||
## helpers
|
## helpers
|
||||||
|
|
||||||
|
@ -110,11 +103,8 @@ function gg_run_ctest_debug {
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
# Check cmake, make and ctest are installed
|
|
||||||
gg_check_build_requirements
|
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
|
|
||||||
|
@ -141,11 +131,8 @@ function gg_run_ctest_release {
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
# Check cmake, make and ctest are installed
|
|
||||||
gg_check_build_requirements
|
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||||
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
|
@ -166,64 +153,13 @@ function gg_sum_ctest_release {
|
||||||
gg_printf '```\n'
|
gg_printf '```\n'
|
||||||
}
|
}
|
||||||
|
|
||||||
# test_scripts_debug
|
|
||||||
|
|
||||||
function gg_run_test_scripts_debug {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_test_scripts_debug {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'Runs test scripts in debug mode\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '\n'
|
|
||||||
}
|
|
||||||
|
|
||||||
# test_scripts_release
|
|
||||||
|
|
||||||
function gg_run_test_scripts_release {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_test_scripts_release {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'Runs test scripts in release mode\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '\n'
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_get_model {
|
function gg_get_model {
|
||||||
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
|
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
|
||||||
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
|
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||||
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
if [[ -s $gguf_3b ]]; then
|
||||||
if [[ -s $gguf_0 ]]; then
|
echo -n "$gguf_3b"
|
||||||
echo -n "$gguf_0"
|
elif [[ -s $gguf_7b ]]; then
|
||||||
elif [[ -s $gguf_1 ]]; then
|
echo -n "$gguf_7b"
|
||||||
echo -n "$gguf_1"
|
|
||||||
elif [[ -s $gguf_2 ]]; then
|
|
||||||
echo -n "$gguf_2"
|
|
||||||
else
|
else
|
||||||
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
||||||
exit 1
|
exit 1
|
||||||
|
@@ -272,34 +208,33 @@ function gg_sum_ctest_with_model_release {
 gg_printf '```\n'
 }

-# open_llama_7b_v2
+# open_llama_3b_v2

-function gg_run_open_llama_7b_v2 {
+function gg_run_open_llama_3b_v2 {
 cd ${SRC}

-gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
-gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
-gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
-gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
-gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
-gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
-gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
-gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
+gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
+gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
+gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
+gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
+gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
+gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json

 gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
 unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw

-path_models="../models-mnt/open-llama/7B-v2"
+path_models="../models-mnt/open-llama/3B-v2"
 path_wiki="../models-mnt/wikitext/wikitext-2-raw"

 rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

 set -e

-(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
+(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+python3 ../convert.py ${path_models}

 model_f16="${path_models}/ggml-model-f16.gguf"
 model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -313,49 +248,46 @@ function gg_run_open_llama_7b_v2 {
 model_q5_k="${path_models}/ggml-model-q5_k.gguf"
 model_q6_k="${path_models}/ggml-model-q6_k.gguf"

-wiki_test="${path_wiki}/wiki.test.raw"
+wiki_test_60="${path_wiki}/wiki.test-60.raw"

-./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
+./bin/quantize ${model_f16} ${model_q8_0} q8_0
+./bin/quantize ${model_f16} ${model_q4_0} q4_0
+./bin/quantize ${model_f16} ${model_q4_1} q4_1
+./bin/quantize ${model_f16} ${model_q5_0} q5_0
+./bin/quantize ${model_f16} ${model_q5_1} q5_1
+./bin/quantize ${model_f16} ${model_q2_k} q2_k
+./bin/quantize ${model_f16} ${model_q3_k} q3_k
+./bin/quantize ${model_f16} ${model_q4_k} q4_k
+./bin/quantize ${model_f16} ${model_q5_k} q5_k
+./bin/quantize ${model_f16} ${model_q6_k} q6_k

-(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

 function check_ppl {
 qnt="$1"
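Every step on both sides of this hunk uses the same logging idiom: run the tool under time, fold stderr into stdout, and append to a per-step log that the matching gg_sum_* function later embeds in the report. A minimal sketch of that idiom, assuming $OUT and ${ci} are set as elsewhere in this script (the step name and command below are placeholders):

step="example-step"                                        # placeholder step name
(time ls ./bin ) 2>&1 | tee -a "$OUT/${ci}-${step}.log"    # placeholder command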
@@ -384,148 +316,58 @@ function gg_run_open_llama_7b_v2 {

 cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

-set +e
-}
-
-function gg_sum_open_llama_7b_v2 {
-gg_printf '### %s\n\n' "${ci}"
-
-gg_printf 'OpenLLaMA 7B-v2:\n'
-gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
-gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
-gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
-gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
-gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
-gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
-gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
-gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
-gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
-gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
-gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-}
-
-# pythia_1.4b
-
-function gg_run_pythia_1_4b {
-cd ${SRC}
-
-gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
-gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
-gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
-gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
-gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
-
-gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
-
-path_models="../models-mnt/pythia/1.4B"
-path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
-rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-set -e
-
-(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
-
-model_f16="${path_models}/ggml-model-f16.gguf"
-model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
-wiki_test_60="${path_wiki}/wiki.test-60.raw"
-
-./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
-
-(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
-(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
-function check_ppl {
+# lora
+function compare_ppl {
 qnt="$1"
-ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

-if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
-printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
+printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
 return 20
 fi

-printf ' - %s @ %s OK\n' "$qnt" "$ppl"
+printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
 return 0
 }

-check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
-check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+path_lora="../models-mnt/open-llama/3B-v2/lora"
+path_shakespeare="../models-mnt/shakespeare"

-cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+shakespeare="${path_shakespeare}/shakespeare.txt"
+lora_shakespeare="${path_lora}/ggml-adapter-model.bin"

+gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
+gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
+gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt

+python3 ../convert-lora-to-ggml.py ${path_lora}

+# f16
+(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

+# q8_0
+(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

+# q8_0 + f16 lora-base
+(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

 set +e
 }

-function gg_sum_pythia_1_4b {
+function gg_sum_open_llama_3b_v2 {
 gg_printf '### %s\n\n' "${ci}"

-gg_printf 'Pythia 1.4B:\n'
+gg_printf 'OpenLLaMA 3B-v2:\n'
 gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
 gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
 gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
 gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
 gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
 gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
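On the branch side, compare_ppl takes two perplexity log excerpts and fails when the base model already scores lower than the LoRA-adapted run, i.e. the adapter is expected to reduce perplexity on the Shakespeare text. A self-contained sketch of that comparison on made-up numbers:

# made-up log lines standing in for the "[1] ..." perplexity output
base="[1] 6.21"
lora="[1] 5.87"
ppl1=$(echo "$base" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$lora" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
    echo "FAIL: base ppl $ppl1 is already lower than lora ppl $ppl2"
else
    echo "OK: lora ppl $ppl2 <= base ppl $ppl1"
fi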
@@ -538,33 +380,42 @@ function gg_sum_pythia_1_4b {
 gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
 gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
 gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
+gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
+gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
+gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
+gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }

-# pythia_2_8b
+# open_llama_7b_v2
+# requires: GG_BUILD_CUDA

-function gg_run_pythia_2_8b {
+function gg_run_open_llama_7b_v2 {
 cd ${SRC}

-gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
-gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
-gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
-gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
-gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
+gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
+gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
+gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
+gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
+gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
+gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
+gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
+gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json

 gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
 unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/

-path_models="../models-mnt/pythia/2.8B"
+path_models="../models-mnt/open-llama/7B-v2"
 path_wiki="../models-mnt/wikitext/wikitext-2-raw"

 rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

 set -e

-(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
+(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+python3 ../convert.py ${path_models}

 model_f16="${path_models}/ggml-model-f16.gguf"
 model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -580,47 +431,44 @@ function gg_run_pythia_2_8b {

 wiki_test="${path_wiki}/wiki.test.raw"

-./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
+./bin/quantize ${model_f16} ${model_q8_0} q8_0
+./bin/quantize ${model_f16} ${model_q4_0} q4_0
+./bin/quantize ${model_f16} ${model_q4_1} q4_1
+./bin/quantize ${model_f16} ${model_q5_0} q5_0
+./bin/quantize ${model_f16} ${model_q5_1} q5_1
+./bin/quantize ${model_f16} ${model_q2_k} q2_k
+./bin/quantize ${model_f16} ${model_q3_k} q3_k
+./bin/quantize ${model_f16} ${model_q4_k} q4_k
+./bin/quantize ${model_f16} ${model_q5_k} q5_k
+./bin/quantize ${model_f16} ${model_q6_k} q6_k

-(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

 function check_ppl {
 qnt="$1"
@@ -641,7 +489,7 @@ function gg_run_pythia_2_8b {
 check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
 check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
 check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
+check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
 check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
 check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
 check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
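check_ppl receives the first-chunk line that grep "^\[1\]" pulls out of each perplexity log, extracts the last decimal number from it, and compares it against a fixed 20.0 threshold with bc. A self-contained sketch of that extraction and check on a made-up log line:

line='[1] 7.4312'                                   # made-up perplexity log line
ppl=$(echo "$line" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
    echo "FAIL: ppl $ppl exceeds 20.0"
else
    echo "OK: ppl $ppl"
fi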
@@ -649,16 +497,59 @@ function gg_run_pythia_2_8b {

 cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

+# lora
+function compare_ppl {
+qnt="$1"
+ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
+printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
+return 20
+fi
+
+printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
+return 0
+}
+
+path_lora="../models-mnt/open-llama/7B-v2/lora"
+path_shakespeare="../models-mnt/shakespeare"
+
+shakespeare="${path_shakespeare}/shakespeare.txt"
+lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
+
+gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
+gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
+gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
+
+python3 ../convert-lora-to-ggml.py ${path_lora}
+
+# f16
+(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+# currently not supported by the CUDA backend
+# q8_0
+#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+# q8_0 + f16 lora-base
+#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

 set +e
 }

-function gg_sum_pythia_2_8b {
+function gg_sum_open_llama_7b_v2 {
 gg_printf '### %s\n\n' "${ci}"

-gg_printf 'Pythia 2.8B:\n'
+gg_printf 'OpenLLaMA 7B-v2:\n'
 gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
 gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
 gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
 gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
 gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
 gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -671,6 +562,11 @@ function gg_sum_pythia_2_8b {
 gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
 gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
 gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
+gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
+#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
+#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
+#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }

 # bge-small
@@ -679,7 +575,7 @@ function gg_run_embd_bge_small {
 cd ${SRC}

 gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
-gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
+gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
 gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
 gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
 gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
@@ -697,17 +593,17 @@ function gg_run_embd_bge_small {
 set -e

 (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
+(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+python3 ../convert-hf-to-gguf.py ${path_models}

 model_f16="${path_models}/ggml-model-f16.gguf"
 model_q8_0="${path_models}/ggml-model-q8_0.gguf"

-./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
+./bin/quantize ${model_f16} ${model_q8_0} q8_0

-(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

 set +e
 }
@@ -721,92 +617,8 @@ function gg_sum_embd_bge_small {
 gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
 }

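The bge-small job above reduces to: convert the HF checkpoint, quantize it, and embed a single prompt with both variants. A condensed sketch using only the binaries and flags that appear in this diff (the model paths below are placeholders):

model_f16="models-mnt/bge-small/ggml-model-f16.gguf"     # placeholder path
model_q8_0="models-mnt/bge-small/ggml-model-q8_0.gguf"   # placeholder path
./bin/llama-quantize "$model_f16" "$model_q8_0" q8_0
./bin/llama-embedding --model "$model_f16"  -p "I believe the meaning of life is" -ngl 99 -c 0
./bin/llama-embedding --model "$model_q8_0" -p "I believe the meaning of life is" -ngl 99 -c 0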
-# rerank_tiny
-
-function gg_run_rerank_tiny {
-cd ${SRC}
-
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
-
-gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json
-
-path_models="../models-mnt/rerank-tiny"
-
-rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-set -e
-
-(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
-
-model_f16="${path_models}/ggml-model-f16.gguf"
-
-# for this model, the SEP token is "</s>"
-(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
-
-# sample output
-# rerank score 0: 0.029
-# rerank score 1: 0.029
-# rerank score 2: 0.135
-
-# check that the score is in the range [$3, $4]
-function check_score {
-qnt="$1"
-score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
-printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
-return 20
-fi
-
-printf ' - %s @ %s OK\n' "$qnt" "$score"
-return 0
-}
-
-check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
-check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
-check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log
-
-set +e
-}
-
-function gg_sum_rerank_tiny {
-gg_printf '### %s\n\n' "${ci}"
-
-gg_printf 'Rerank Tiny (Jina):\n'
-gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
-}
-
-function gg_check_build_requirements {
-if ! command -v cmake &> /dev/null; then
-gg_printf 'cmake not found, please install'
-fi
-
-if ! command -v make &> /dev/null; then
-gg_printf 'make not found, please install'
-fi
-
-if ! command -v ctest &> /dev/null; then
-gg_printf 'ctest not found, please install'
-fi
-}
-
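check_score in the removed rerank_tiny job uses the same bc-based pattern as check_ppl, except it validates that a rerank score falls inside a closed range rather than under a threshold. A self-contained sketch of that range check on a made-up log line:

line="rerank score 2: 0.135"                        # made-up log line
score=$(echo "$line" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
lo="0.10"; hi="0.30"
if [ $(echo "$score < $lo" | bc) -eq 1 ] || [ $(echo "$score > $hi" | bc) -eq 1 ]; then
    echo "FAIL: score $score not in range [$lo, $hi]"
else
    echo "OK: score $score"
fi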
 ## main

-export LLAMA_LOG_PREFIX=1
-export LLAMA_LOG_TIMESTAMPS=1
-
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
 # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
 rm -rf ${SRC}/models-mnt
@@ -815,10 +627,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 ln -sfn ${mnt_models} ${SRC}/models-mnt

 # Create a fresh python3 venv and enter it
-if ! python3 -m venv "$MNT/venv"; then
-echo "Error: Failed to create Python virtual environment at $MNT/venv."
-exit 1
-fi
+python3 -m venv "$MNT/venv"
 source "$MNT/venv/bin/activate"

 pip install -r ${SRC}/requirements.txt --disable-pip-version-check
@@ -832,19 +641,12 @@ test $ret -eq 0 && gg_run ctest_release

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
 test $ret -eq 0 && gg_run embd_bge_small
-test $ret -eq 0 && gg_run rerank_tiny
-
-if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
-test $ret -eq 0 && gg_run test_scripts_debug
-test $ret -eq 0 && gg_run test_scripts_release
-fi

 if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
-test $ret -eq 0 && gg_run pythia_1_4b
+if [ -z ${GG_BUILD_CUDA} ]; then
+test $ret -eq 0 && gg_run open_llama_3b_v2
 else
-test $ret -eq 0 && gg_run pythia_2_8b
-#test $ret -eq 0 && gg_run open_llama_7b_v2
+test $ret -eq 0 && gg_run open_llama_7b_v2
 fi
 test $ret -eq 0 && gg_run ctest_with_model_debug
 test $ret -eq 0 && gg_run ctest_with_model_release
@ -79,22 +79,22 @@ endmacro()

# flags are for MSVC only!
check_sse("AVX" " ;/arch:AVX")
if (NOT ${AVX_FOUND})
    set(GGML_AVX OFF)
    set(LLAMA_AVX OFF)
else()
    set(GGML_AVX ON)
    set(LLAMA_AVX ON)
endif()

check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2")
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
    set(GGML_AVX2 OFF)
    set(LLAMA_AVX2 OFF)
else()
    set(GGML_AVX2 ON)
    set(LLAMA_AVX2 ON)
endif()

check_sse("AVX512" " ;/arch:AVX512")
if (NOT ${AVX512_FOUND})
    set(GGML_AVX512 OFF)
    set(LLAMA_AVX512 OFF)
else()
    set(GGML_AVX512 ON)
    set(LLAMA_AVX512 ON)
endif()
@ -1,16 +0,0 @@
set( CMAKE_SYSTEM_NAME Darwin )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

set( target arm64-apple-darwin-macho )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )

set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )

set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
@ -1,16 +0,0 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

set( target arm64-pc-windows-msvc )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )

set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )

set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
@ -1,6 +0,0 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
@ -1,33 +0,0 @@
function(llama_add_compile_flags)
    if (LLAMA_FATAL_WARNINGS)
        if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
            list(APPEND C_FLAGS -Werror)
            list(APPEND CXX_FLAGS -Werror)
        elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
            add_compile_options(/WX)
        endif()
    endif()

    if (LLAMA_ALL_WARNINGS)
        if (NOT MSVC)
            list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
                                -Werror=implicit-int -Werror=implicit-function-declaration)

            list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)

            list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)

            list(APPEND C_FLAGS ${WARNING_FLAGS})
            list(APPEND CXX_FLAGS ${WARNING_FLAGS})

            ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})

            add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
                                "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
        else()
            # todo : msvc
            set(C_FLAGS "" PARENT_SCOPE)
            set(CXX_FLAGS "" PARENT_SCOPE)
        endif()
    endif()
endfunction()
@ -1,22 +0,0 @@
find_package(Git)

# the commit's SHA1
execute_process(COMMAND
    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_SHA1
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

# the date of the commit
execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_DATE
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

# the subject of the commit
execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%s
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
@ -1,30 +0,0 @@
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)

@PACKAGE_INIT@

set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)

find_library(llama_LIBRARY llama
    REQUIRED
    HINTS ${LLAMA_LIB_DIR}
    NO_CMAKE_FIND_ROOT_PATH
)

add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
        INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
        IMPORTED_LOCATION "${llama_LIBRARY}"
        INTERFACE_COMPILE_FEATURES c_std_90
        POSITION_INDEPENDENT_CODE ON)

check_required_components(Llama)
@ -1,10 +0,0 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=@CMAKE_INSTALL_PREFIX@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@

Name: llama
Description: Port of Facebook's LLaMA model in C/C++
Version: @LLAMA_INSTALL_VERSION@
Libs: -L${libdir} -lggml -lggml-base -lllama
Cflags: -I${includedir}
@ -1,11 +0,0 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR x86_64 )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )

set( arch_c_flags "-march=native" )

set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
codecov.yml (new file, 14 lines)
@ -0,0 +1,14 @@
comment: off

coverage:
  status:
    project:
      default:
        target: auto
        threshold: 0
        base: auto
    patch:
      default:
        target: auto
        threshold: 0
        base: auto
@ -1,8 +1,5 @@
# common

find_package(Threads REQUIRED)

llama_add_compile_flags()

# Build info header
#

@ -39,7 +36,7 @@ add_custom_command(
    COMMENT "Generating build details from Git"
    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
    VERBATIM

@ -50,31 +47,21 @@ if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()

set(TARGET common)

add_library(${TARGET} STATIC
    arg.cpp
    arg.h
    base64.hpp
    chat.cpp
    chat.hpp
    chat-template.hpp
    common.cpp
    common.h
    console.cpp
    common.cpp
    console.h
    json-schema-to-grammar.cpp
    json.hpp
    llguidance.cpp
    log.cpp
    log.h
    minja.hpp
    ngram-cache.cpp
    ngram-cache.h
    sampling.cpp
    sampling.h
    speculative.cpp
    sampling.cpp
    speculative.h
    console.h
    console.cpp
    grammar-parser.h
    grammar-parser.cpp
    train.h
    train.cpp
    )

if (BUILD_SHARED_LIBS)

@ -86,39 +73,12 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url
if (LLAMA_CURL)
    find_package(CURL REQUIRED)
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
    add_definitions(-DLLAMA_USE_CURL)
    include_directories(${CURL_INCLUDE_DIRS})
    find_library(CURL_LIBRARY curl REQUIRED)
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
endif ()

if (LLAMA_LLGUIDANCE)
    include(ExternalProject)
    set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
    set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
        # v0.6.12:
        GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
        PREFIX ${CMAKE_BINARY_DIR}/llguidance
        SOURCE_DIR ${LLGUIDANCE_SRC}
        BUILD_IN_SOURCE TRUE
        CONFIGURE_COMMAND ""
        BUILD_COMMAND cargo build --release
        INSTALL_COMMAND ""
        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/libllguidance.a ${LLGUIDANCE_PATH}/llguidance.h
        UPDATE_COMMAND ""
    )
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)

    add_library(llguidance STATIC IMPORTED)
    set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/libllguidance.a)
    add_dependencies(llguidance llguidance_ext)

    target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance)
endif ()

target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_17)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
common/arg.cpp (2370 lines): file diff suppressed because it is too large
common/arg.h (80 lines)
@ -1,80 +0,0 @@
#pragma once

#include "common.h"

#include <set>
#include <string>
#include <vector>

//
// CLI argument parsing
//

struct common_arg {
    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
    std::set<enum llama_example> excludes = {};
    std::vector<const char *> args;
    const char * value_hint = nullptr; // help text or example for arg value
    const char * value_hint_2 = nullptr; // for second arg value
    const char * env = nullptr;
    std::string help;
    bool is_sparam = false; // is current arg a sampling param?
    void (*handler_void) (common_params & params) = nullptr;
    void (*handler_string) (common_params & params, const std::string &) = nullptr;
    void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
    void (*handler_int) (common_params & params, int) = nullptr;

    common_arg(
        const std::initializer_list<const char *> & args,
        const char * value_hint,
        const std::string & help,
        void (*handler)(common_params & params, const std::string &)
    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}

    common_arg(
        const std::initializer_list<const char *> & args,
        const char * value_hint,
        const std::string & help,
        void (*handler)(common_params & params, int)
    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}

    common_arg(
        const std::initializer_list<const char *> & args,
        const std::string & help,
        void (*handler)(common_params & params)
    ) : args(args), help(help), handler_void(handler) {}

    // support 2 values for arg
    common_arg(
        const std::initializer_list<const char *> & args,
        const char * value_hint,
        const char * value_hint_2,
        const std::string & help,
        void (*handler)(common_params & params, const std::string &, const std::string &)
    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}

    common_arg & set_examples(std::initializer_list<enum llama_example> examples);
    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
    common_arg & set_env(const char * env);
    common_arg & set_sparam();
    bool in_example(enum llama_example ex);
    bool is_exclude(enum llama_example ex);
    bool get_value_from_env(std::string & output);
    bool has_value_from_env();
    std::string to_string();
};

struct common_params_context {
    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
    common_params & params;
    std::vector<common_arg> options;
    void(*print_usage)(int, char **) = nullptr;
    common_params_context(common_params & params) : params(params) {}
};

// parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

// function to be used by test-arg-parser
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
@ -1,529 +0,0 @@
|
||||||
/*
|
|
||||||
Copyright 2024 Google LLC
|
|
||||||
|
|
||||||
Use of this source code is governed by an MIT-style
|
|
||||||
license that can be found in the LICENSE file or at
|
|
||||||
https://opensource.org/licenses/MIT.
|
|
||||||
*/
|
|
||||||
// SPDX-License-Identifier: MIT
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "minja.hpp"
|
|
||||||
#include <json.hpp>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
using json = nlohmann::ordered_json;
|
|
||||||
|
|
||||||
namespace minja {
|
|
||||||
|
|
||||||
struct chat_template_caps {
|
|
||||||
bool supports_tools = false;
|
|
||||||
bool supports_tool_calls = false;
|
|
||||||
bool supports_tool_responses = false;
|
|
||||||
bool supports_system_role = false;
|
|
||||||
bool supports_parallel_tool_calls = false;
|
|
||||||
bool supports_tool_call_id = false;
|
|
||||||
// meta-llama/Llama-3.1-8B-Instruct expects arguments to be an object.
|
|
||||||
// Most other templates (and OpenAI's API) expect the arguments object to be stringified.
|
|
||||||
bool requires_object_arguments = false;
|
|
||||||
// CohereForAI/c4ai-command-r-plus simple variant
|
|
||||||
bool requires_non_null_content = false;
|
|
||||||
// MiniMaxAI/MiniMax-Text-01 special
|
|
||||||
bool requires_typed_content = false;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct chat_template_inputs {
|
|
||||||
nlohmann::ordered_json messages;
|
|
||||||
nlohmann::ordered_json tools;
|
|
||||||
bool add_generation_prompt = true;
|
|
||||||
nlohmann::ordered_json extra_context;
|
|
||||||
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
|
||||||
};
|
|
||||||
|
|
||||||
struct chat_template_options {
|
|
||||||
bool apply_polyfills = true;
|
|
||||||
bool use_bos_token = true;
|
|
||||||
bool use_eos_token = true;
|
|
||||||
bool define_strftime_now = true;
|
|
||||||
|
|
||||||
bool polyfill_tools = true;
|
|
||||||
bool polyfill_tool_call_examples = true;
|
|
||||||
bool polyfill_tool_calls = true;
|
|
||||||
bool polyfill_tool_responses = true;
|
|
||||||
bool polyfill_system_role = true;
|
|
||||||
bool polyfill_object_arguments = true;
|
|
||||||
bool polyfill_typed_content = true;
|
|
||||||
};
|
|
||||||
|
|
||||||
class chat_template {
|
|
||||||
|
|
||||||
private:
|
|
||||||
chat_template_caps caps_;
|
|
||||||
std::string source_;
|
|
||||||
std::string bos_token_;
|
|
||||||
std::string eos_token_;
|
|
||||||
std::shared_ptr<minja::TemplateNode> template_root_;
|
|
||||||
std::string tool_call_example_;
|
|
||||||
|
|
||||||
std::string try_raw_render(
|
|
||||||
const nlohmann::ordered_json & messages,
|
|
||||||
const nlohmann::ordered_json & tools,
|
|
||||||
bool add_generation_prompt,
|
|
||||||
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const
|
|
||||||
{
|
|
||||||
try {
|
|
||||||
chat_template_inputs inputs;
|
|
||||||
inputs.messages = messages;
|
|
||||||
inputs.tools = tools;
|
|
||||||
inputs.add_generation_prompt = add_generation_prompt;
|
|
||||||
inputs.extra_context = extra_context;
|
|
||||||
// Use fixed date for tests
|
|
||||||
inputs.now = std::chrono::system_clock::from_time_t(0);
|
|
||||||
|
|
||||||
chat_template_options opts;
|
|
||||||
opts.apply_polyfills = false;
|
|
||||||
|
|
||||||
auto prompt = apply(inputs, opts);
|
|
||||||
// fprintf(stderr, "try_raw_render: %s\n", prompt.c_str());
|
|
||||||
return prompt;
|
|
||||||
} catch (const std::exception & e) {
|
|
||||||
// fprintf(stderr, "try_raw_render error: %s\n", e.what());
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public:
|
|
||||||
|
|
||||||
chat_template(const std::string & source, const std::string & bos_token, const std::string & eos_token)
|
|
||||||
: source_(source), bos_token_(bos_token), eos_token_(eos_token)
|
|
||||||
{
|
|
||||||
template_root_ = minja::Parser::parse(source_, {
|
|
||||||
/* .trim_blocks = */ true,
|
|
||||||
/* .lstrip_blocks = */ true,
|
|
||||||
/* .keep_trailing_newline = */ false,
|
|
||||||
});
|
|
||||||
|
|
||||||
auto contains = [](const std::string & haystack, const std::string & needle) {
|
|
||||||
return haystack.find(needle) != std::string::npos;
|
|
||||||
};
|
|
||||||
|
|
||||||
const std::string user_needle = "<User Needle>";
|
|
||||||
const std::string sys_needle = "<System Needle>";
|
|
||||||
const json dummy_str_user_msg = {{"role", "user"}, {"content", user_needle}};
|
|
||||||
const json dummy_typed_user_msg = {{"role", "user"}, {"content", json::array({{{"type", "text"}, {"text", user_needle}}})}};
|
|
||||||
|
|
||||||
caps_.requires_typed_content =
|
|
||||||
!contains(try_raw_render(json::array({dummy_str_user_msg}), {}, false), user_needle)
|
|
||||||
&& contains(try_raw_render(json::array({dummy_typed_user_msg}), {}, false), user_needle);
|
|
||||||
|
|
||||||
const auto dummy_user_msg = caps_.requires_typed_content
|
|
||||||
? dummy_typed_user_msg
|
|
||||||
: dummy_str_user_msg;
|
|
||||||
const json needle_system_msg = {
|
|
||||||
{"role", "system"},
|
|
||||||
{"content", caps_.requires_typed_content ? json::array({{{"type", "text"}, {"text", sys_needle}}}) : json(sys_needle)},
|
|
||||||
};
|
|
||||||
|
|
||||||
caps_.supports_system_role = contains(try_raw_render({needle_system_msg, dummy_user_msg,}, {}, false), sys_needle);
|
|
||||||
|
|
||||||
auto out = try_raw_render(json::array({
|
|
||||||
dummy_user_msg
|
|
||||||
}), json::array({
|
|
||||||
{
|
|
||||||
{"name", "some_tool"},
|
|
||||||
{"type", "function"},
|
|
||||||
{"function", {
|
|
||||||
{"name", "some_tool"},
|
|
||||||
{"description", "Some tool."},
|
|
||||||
{"parameters", {
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
{"arg", {
|
|
||||||
{"type", "string"},
|
|
||||||
{"description", "Some argument."},
|
|
||||||
}},
|
|
||||||
}},
|
|
||||||
{"required", json::array({ "arg" })},
|
|
||||||
}},
|
|
||||||
}},
|
|
||||||
},
|
|
||||||
}), false);
|
|
||||||
caps_.supports_tools = contains(out, "some_tool");
|
|
||||||
|
|
||||||
auto make_tool_calls_msg = [&](const json & tool_calls) {
|
|
||||||
return json {
|
|
||||||
{"role", "assistant"},
|
|
||||||
{"content", nullptr},
|
|
||||||
{"tool_calls", tool_calls},
|
|
||||||
};
|
|
||||||
};
|
|
||||||
auto make_tool_call = [](const std::string & tool_name, const json & arguments) {
|
|
||||||
return json {
|
|
||||||
{"id", "call_1___"},
|
|
||||||
{"type", "function"},
|
|
||||||
{"function", {
|
|
||||||
{"arguments", arguments},
|
|
||||||
{"name", tool_name},
|
|
||||||
}},
|
|
||||||
};
|
|
||||||
};
|
|
||||||
const json dummy_args_obj {{"argument_needle", "print('Hello, World!')"}};
|
|
||||||
|
|
||||||
// Note: the arguments are rendered in both cases, but may be double-escaped, which we don't want.
|
|
||||||
out = try_raw_render(json::array({
|
|
||||||
dummy_user_msg,
|
|
||||||
make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj.dump())})),
|
|
||||||
}), {}, false);
|
|
||||||
auto tool_call_renders_str_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
|
|
||||||
out = try_raw_render(json::array({
|
|
||||||
dummy_user_msg,
|
|
||||||
make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj)})),
|
|
||||||
}), {}, false);
|
|
||||||
auto tool_call_renders_obj_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
|
|
||||||
|
|
||||||
caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
|
|
||||||
caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
|
|
||||||
auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
|
|
||||||
auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
|
|
||||||
caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
|
|
||||||
|
|
||||||
if (caps_.supports_tool_calls) {
|
|
||||||
auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump());
|
|
||||||
auto tc1 = make_tool_call("test_tool1", dummy_args);
|
|
||||||
auto tc2 = make_tool_call("test_tool2", dummy_args);
|
|
||||||
auto out = try_raw_render(json::array({
|
|
||||||
dummy_user_msg,
|
|
||||||
make_tool_calls_msg(json::array({tc1, tc2})),
|
|
||||||
}), {}, false);
|
|
||||||
caps_.supports_parallel_tool_calls = contains(out, "test_tool1") && contains(out, "test_tool2");
|
|
||||||
|
|
||||||
out = try_raw_render(json::array({
|
|
||||||
dummy_user_msg,
|
|
||||||
make_tool_calls_msg(json::array({tc1})),
|
|
||||||
{
|
|
||||||
{"role", "tool"},
|
|
||||||
{"name", "test_tool1"},
|
|
||||||
{"content", "Some response!"},
|
|
||||||
{"tool_call_id", "call_911_"},
|
|
||||||
}
|
|
||||||
}), {}, false);
|
|
||||||
caps_.supports_tool_responses = contains(out, "Some response!");
|
|
||||||
caps_.supports_tool_call_id = contains(out, "call_911_");
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
if (!caps_.supports_tools) {
|
|
||||||
const json user_msg {
|
|
||||||
{"role", "user"},
|
|
||||||
{"content", "Hey"},
|
|
||||||
};
|
|
||||||
const json args {
|
|
||||||
{"arg1", "some_value"},
|
|
||||||
};
|
|
||||||
const json tool_call_msg {
|
|
||||||
{"role", "assistant"},
|
|
||||||
{"content", nullptr},
|
|
||||||
{"tool_calls", json::array({
|
|
||||||
{
|
|
||||||
// TODO: detect if requires numerical id or fixed length == 6 like Nemo
|
|
||||||
{"id", "call_1___"},
|
|
||||||
{"type", "function"},
|
|
||||||
{"function", {
|
|
||||||
{"name", "tool_name"},
|
|
||||||
{"arguments", (caps_.requires_object_arguments ? args : json(minja::Value(args).dump(-1, /* to_json= */ true)))},
|
|
||||||
}},
|
|
||||||
},
|
|
||||||
})},
|
|
||||||
};
|
|
||||||
std::string prefix, full;
|
|
||||||
{
|
|
||||||
chat_template_inputs inputs;
|
|
||||||
inputs.messages = json::array({user_msg});
|
|
||||||
inputs.add_generation_prompt = true;
|
|
||||||
prefix = apply(inputs);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
chat_template_inputs inputs;
|
|
||||||
inputs.messages = json::array({user_msg, tool_call_msg});
|
|
||||||
inputs.add_generation_prompt = false;
|
|
||||||
full = apply(inputs);
|
|
||||||
}
|
|
||||||
auto eos_pos_last = full.rfind(eos_token_);
|
|
||||||
if (eos_pos_last == prefix.size() - eos_token_.size() ||
|
|
||||||
(full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
|
|
||||||
full = full.substr(0, eos_pos_last);
|
|
||||||
}
|
|
||||||
size_t common_prefix_length = 0;
|
|
||||||
for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
|
|
||||||
if (prefix[i] != full[i]) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (prefix[i] == '<') {
|
|
||||||
// DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
|
|
||||||
// but it removes thinking tags for past messages.
|
|
||||||
// The prefix and full strings diverge at <think> vs. <|tool▁calls▁begin|>, we avoid consuming the leading <.
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
common_prefix_length = i + 1;
|
|
||||||
}
|
|
||||||
auto example = full.substr(common_prefix_length);
|
|
||||||
if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
|
|
||||||
fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
|
|
||||||
} else {
|
|
||||||
tool_call_example_ = example;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (const std::exception & e) {
|
|
||||||
fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string & source() const { return source_; }
|
|
||||||
const std::string & bos_token() const { return bos_token_; }
|
|
||||||
const std::string & eos_token() const { return eos_token_; }
|
|
||||||
const chat_template_caps & original_caps() const { return caps_; }
|
|
||||||
|
|
||||||
// Deprecated, please use the form with chat_template_inputs and chat_template_options
|
|
||||||
std::string apply(
|
|
||||||
const nlohmann::ordered_json & messages,
|
|
||||||
const nlohmann::ordered_json & tools,
|
|
||||||
bool add_generation_prompt,
|
|
||||||
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(),
|
|
||||||
bool apply_polyfills = true)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "[%s] Deprecated!\n", __func__);
|
|
||||||
chat_template_inputs inputs;
|
|
||||||
inputs.messages = messages;
|
|
||||||
inputs.tools = tools;
|
|
||||||
inputs.add_generation_prompt = add_generation_prompt;
|
|
||||||
inputs.extra_context = extra_context;
|
|
||||||
inputs.now = std::chrono::system_clock::now();
|
|
||||||
|
|
||||||
chat_template_options opts;
|
|
||||||
opts.apply_polyfills = apply_polyfills;
|
|
||||||
|
|
||||||
return apply(inputs, opts);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string apply(
|
|
||||||
const chat_template_inputs & inputs,
|
|
||||||
const chat_template_options & opts = chat_template_options()) const
|
|
||||||
{
|
|
||||||
json actual_messages;
|
|
||||||
|
|
||||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
|
||||||
auto has_tool_calls = false;
|
|
||||||
auto has_tool_responses = false;
|
|
||||||
auto has_string_content = false;
|
|
||||||
for (const auto & message : inputs.messages) {
|
|
||||||
if (message.contains("tool_calls") && !message["tool_calls"].is_null()) {
|
|
||||||
has_tool_calls = true;
|
|
||||||
}
|
|
||||||
if (message.contains("role") && message["role"] == "tool") {
|
|
||||||
has_tool_responses = true;
|
|
||||||
}
|
|
||||||
if (message.contains("content") && message["content"].is_string()) {
|
|
||||||
has_string_content = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
auto polyfill_system_role = opts.polyfill_system_role && !caps_.supports_system_role;
|
|
||||||
auto polyfill_tools = opts.polyfill_tools && has_tools && !caps_.supports_tools;
|
|
||||||
auto polyfill_tool_call_example = polyfill_tools && opts.polyfill_tool_call_examples;
|
|
||||||
auto polyfill_tool_calls = opts.polyfill_tool_calls && has_tool_calls && !caps_.supports_tool_calls;
|
|
||||||
auto polyfill_tool_responses = opts.polyfill_tool_responses && has_tool_responses && !caps_.supports_tool_responses;
|
|
||||||
auto polyfill_object_arguments = opts.polyfill_object_arguments && has_tool_calls && caps_.requires_object_arguments;
|
|
||||||
auto polyfill_typed_content = opts.polyfill_typed_content && has_string_content && caps_.requires_typed_content;
|
|
||||||
|
|
||||||
auto needs_polyfills = opts.apply_polyfills && (false
|
|
||||||
|| polyfill_system_role
|
|
||||||
|| polyfill_tools
|
|
||||||
|| polyfill_tool_calls
|
|
||||||
|| polyfill_tool_responses
|
|
||||||
|| polyfill_object_arguments
|
|
||||||
|| polyfill_typed_content
|
|
||||||
);
|
|
||||||
|
|
||||||
if (needs_polyfills) {
|
|
||||||
actual_messages = json::array();
|
|
||||||
|
|
||||||
auto add_message = [&](const json & msg) {
|
|
||||||
if (polyfill_typed_content && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) {
|
|
||||||
actual_messages.push_back({
|
|
||||||
{"role", msg.at("role")},
|
|
||||||
{"content", {{
|
|
||||||
{"type", "text"},
|
|
||||||
{"text", msg.at("content")},
|
|
||||||
}}},
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
actual_messages.push_back(msg);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
std::string pending_system;
|
|
||||||
auto flush_sys = [&]() {
|
|
||||||
if (!pending_system.empty()) {
|
|
||||||
add_message({
|
|
||||||
{"role", "user"},
|
|
||||||
{"content", pending_system},
|
|
||||||
});
|
|
||||||
pending_system.clear();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
json adjusted_messages;
|
|
||||||
if (polyfill_tools) {
|
|
||||||
adjusted_messages = add_system(inputs.messages,
|
|
||||||
"You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
|
|
||||||
(!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
|
|
||||||
} else {
|
|
||||||
adjusted_messages = inputs.messages;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const auto & message_ : adjusted_messages) {
|
|
||||||
auto message = message_;
|
|
||||||
if (!message.contains("role") || !message.contains("content")) {
|
|
||||||
throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
|
|
||||||
}
|
|
||||||
std::string role = message.at("role");
|
|
||||||
|
|
||||||
if (message.contains("tool_calls")) {
|
|
||||||
if (polyfill_object_arguments || polyfill_tool_calls) {
|
|
||||||
for (auto & tool_call : message.at("tool_calls")) {
|
|
||||||
if (tool_call["type"] == "function") {
|
|
||||||
auto & function = tool_call.at("function");
|
|
||||||
auto & arguments = function.at("arguments");
|
|
||||||
if (arguments.is_string()) {
|
|
||||||
try {
|
|
||||||
arguments = json::parse(arguments.get<std::string>());
|
|
||||||
} catch (const std::exception & ecvt) {
|
|
||||||
fprintf(stderr, "Failed to parse arguments: %s\n", ecvt.what());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (polyfill_tool_calls) {
|
|
||||||
auto content = message.at("content");
|
|
||||||
auto tool_calls = json::array();
|
|
||||||
for (const auto & tool_call : message.at("tool_calls")) {
|
|
||||||
if (tool_call.at("type") != "function") {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const auto & function = tool_call.at("function");
|
|
||||||
auto tc = json {
|
|
||||||
{"name", function.at("name")},
|
|
||||||
{"arguments", function.at("arguments")},
|
|
||||||
};
|
|
||||||
if (tool_call.contains("id")) {
|
|
||||||
tc["id"] = tool_call["id"];
|
|
||||||
}
|
|
||||||
tool_calls.push_back(tc);
|
|
||||||
}
|
|
||||||
auto obj = json {
|
|
||||||
{"tool_calls", tool_calls},
|
|
||||||
};
|
|
||||||
if (!content.is_null() && content != "") {
|
|
||||||
obj["content"] = content;
|
|
||||||
}
|
|
||||||
message["content"] = obj.dump(2);
|
|
||||||
message.erase("tool_calls");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (polyfill_tool_responses && role == "tool") {
|
|
||||||
message["role"] = "user";
|
|
||||||
auto obj = json {
|
|
||||||
{"tool_response", {
|
|
||||||
{"content", message.at("content")},
|
|
||||||
}},
|
|
||||||
};
|
|
||||||
if (message.contains("name")) {
|
|
||||||
obj["tool_response"]["name"] = message.at("name");
|
|
||||||
}
|
|
||||||
if (message.contains("tool_call_id")) {
|
|
||||||
obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
|
|
||||||
}
|
|
||||||
message["content"] = obj.dump(2);
|
|
||||||
message.erase("name");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!message["content"].is_null() && polyfill_system_role) {
|
|
||||||
std::string content = message.at("content");
|
|
||||||
if (role == "system") {
|
|
||||||
if (!pending_system.empty()) pending_system += "\n";
|
|
||||||
pending_system += content;
|
|
||||||
continue;
|
|
||||||
} else {
|
|
||||||
if (role == "user") {
|
|
||||||
if (!pending_system.empty()) {
|
|
||||||
message["content"] = pending_system + (content.empty() ? "" : "\n" + content);
|
|
||||||
pending_system.clear();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
flush_sys();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
add_message(message);
|
|
||||||
}
|
|
||||||
flush_sys();
|
|
||||||
} else {
|
|
||||||
actual_messages = inputs.messages;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto context = minja::Context::make(json({
|
|
||||||
{"messages", actual_messages},
|
|
||||||
{"add_generation_prompt", inputs.add_generation_prompt},
|
|
||||||
}));
|
|
||||||
context->set("bos_token", opts.use_bos_token ? bos_token_ : "");
|
|
||||||
context->set("eos_token", opts.use_eos_token ? eos_token_ : "");
|
|
||||||
if (opts.define_strftime_now) {
|
|
||||||
auto now = inputs.now;
|
|
||||||
context->set("strftime_now", Value::callable([now](const std::shared_ptr<minja::Context> &, minja::ArgumentsValue & args) {
|
|
||||||
args.expectArgs("strftime_now", {1, 1}, {0, 0});
|
|
||||||
auto format = args.args[0].get<std::string>();
|
|
||||||
|
|
||||||
auto time = std::chrono::system_clock::to_time_t(now);
|
|
||||||
auto local_time = *std::localtime(&time);
|
|
||||||
std::ostringstream ss;
|
|
||||||
ss << std::put_time(&local_time, format.c_str());
|
|
||||||
return ss.str();
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
if (!inputs.tools.is_null()) {
|
|
||||||
context->set("tools", minja::Value(inputs.tools));
|
|
||||||
}
|
|
||||||
if (!inputs.extra_context.is_null()) {
|
|
||||||
for (auto & kv : inputs.extra_context.items()) {
|
|
||||||
context->set(kv.key(), minja::Value(kv.value()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
auto ret = template_root_->render(context);
|
|
||||||
// fprintf(stderr, "actual_messages: %s\n", actual_messages.dump(2).c_str());
|
|
||||||
// fprintf(stderr, "apply: %s\n\n", ret.c_str());
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
|
|
||||||
json messages_with_system = messages;
|
|
||||||
|
|
||||||
if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
|
|
||||||
std::string existing_system = messages_with_system.at(0).at("content");
|
|
||||||
messages_with_system[0] = json {
|
|
||||||
{"role", "system"},
|
|
||||||
{"content", existing_system + "\n\n" + system_prompt},
|
|
||||||
};
|
|
||||||
} else {
|
|
||||||
messages_with_system.insert(messages_with_system.begin(), json {
|
|
||||||
{"role", "system"},
|
|
||||||
{"content", system_prompt},
|
|
||||||
});
|
|
||||||
}
|
|
||||||
return messages_with_system;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace minja
|
|
common/chat.cpp (966 lines)
@ -1,966 +0,0 @@
|
||||||
#include "chat.hpp"
|
|
||||||
#include "chat-template.hpp"
|
|
||||||
#include "json-schema-to-grammar.h"
|
|
||||||
#include "log.h"
|
|
||||||
#include "minja.hpp"
|
|
||||||
|
|
||||||
std::string common_chat_format_name(common_chat_format format) {
|
|
||||||
switch (format) {
|
|
||||||
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
|
|
||||||
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
|
|
||||||
case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
|
|
||||||
case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
|
|
||||||
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
|
|
||||||
case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
|
|
||||||
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
|
|
||||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
|
|
||||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
|
|
||||||
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
|
|
||||||
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
|
|
||||||
default:
|
|
||||||
throw std::runtime_error("Unknown chat format");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const common_grammar_options grammar_options {
|
|
||||||
/* .dotall = */ false,
|
|
||||||
/* .compact_spaces = */ false,
|
|
||||||
// /* .compact_spaces = */ true,
|
|
||||||
};
|
|
||||||
|
|
||||||
static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) {
|
|
||||||
// // https://json.nlohmann.me/features/parsing/sax_interface/
|
|
||||||
struct json_error_locator : public nlohmann::json_sax<json> {
|
|
||||||
std::size_t position;
|
|
||||||
bool found_error;
|
|
||||||
|
|
||||||
json_error_locator() : position(0), found_error(false) {}
|
|
||||||
|
|
||||||
bool parse_error(std::size_t position, const std::string &, const json::exception &) override {
|
|
||||||
this->position = position - 1;
|
|
||||||
this->found_error = true;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
bool null() override { return true; }
|
|
||||||
bool boolean(bool) override { return true; }
|
|
||||||
bool number_integer(number_integer_t) override { return true; }
|
|
||||||
bool number_unsigned(number_unsigned_t) override { return true; }
|
|
||||||
bool number_float(number_float_t, const string_t &) override { return true; }
|
|
||||||
bool string(string_t &) override { return true; }
|
|
||||||
bool binary(binary_t &) override { return true; }
|
|
||||||
bool start_object(std::size_t) override { return true; }
|
|
||||||
bool key(string_t &) override { return true; }
|
|
||||||
bool end_object() override { return true; }
|
|
||||||
bool start_array(std::size_t) override { return true; }
|
|
||||||
bool end_array() override { return true; }
|
|
||||||
};
|
|
||||||
json_error_locator err_loc;
|
|
||||||
json::sax_parse(it, end, &err_loc);
|
|
||||||
|
|
||||||
std::string::const_iterator temptative_end;
|
|
||||||
if (err_loc.found_error) {
|
|
||||||
temptative_end = it + err_loc.position;
|
|
||||||
} else {
|
|
||||||
temptative_end = end;
|
|
||||||
}
|
|
||||||
std::string json_sub {it, temptative_end};
|
|
||||||
try {
|
|
||||||
out = json::parse(json_sub);
|
|
||||||
it = temptative_end;
|
|
||||||
return true;
|
|
||||||
} catch (const std::exception &) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
|
|
||||||
* Aggregates the prefix, suffix and in-between text into the content.
|
|
||||||
*/
|
|
||||||
static common_chat_msg parse_json_tool_calls(
|
|
||||||
const std::string& input,
|
|
||||||
const std::optional<std::regex> & trigger_opt,
|
|
||||||
const std::regex & function_regex,
|
|
||||||
const std::regex & close_regex) {
|
|
||||||
std::smatch match;
|
|
||||||
|
|
||||||
common_chat_msg result;
|
|
||||||
result.role = "assistant";
|
|
||||||
|
|
||||||
|
|
||||||
auto end = input.end();
|
|
||||||
auto it = input.begin();
|
|
||||||
|
|
||||||
if (trigger_opt) {
|
|
||||||
if (!std::regex_search(it, end, match, *trigger_opt)) {
|
|
||||||
result.content = input;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
result.content = match.prefix().str();
|
|
||||||
it = match.suffix().first;
|
|
||||||
}
|
|
||||||
|
|
||||||
while (it != end) {
|
|
||||||
std::sregex_iterator rend;
|
|
||||||
std::sregex_iterator rit(it, end, function_regex);
|
|
||||||
if (rit == rend) {
|
|
||||||
fprintf(stderr, "No more tool calls found\n");
|
|
||||||
result.content += std::string(it, end);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
auto name = rit->str(1);
|
|
||||||
result.content += std::string(it, rit->prefix().second);
|
|
||||||
it = rit->suffix().first;
|
|
||||||
|
|
||||||
json arguments;
|
|
||||||
if (!parse_json(it, end, arguments)) {
|
|
||||||
throw std::runtime_error("Failed to parse json tool call arguments");
|
|
||||||
}
|
|
||||||
if (!std::regex_search(it, end, match, close_regex)) {
|
|
||||||
throw std::runtime_error("Malformed input, missing closing pattern");
|
|
||||||
}
|
|
||||||
it = match.suffix().first;
|
|
||||||
result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) {
|
|
||||||
auto content_end = input.find(prefix);
|
|
||||||
size_t tc_start = std::string::npos;
|
|
||||||
|
|
||||||
common_chat_msg result;
|
|
||||||
result.role = "assistant";
|
|
||||||
const auto process_tool_calls = [&](const json & tool_calls) {
|
|
||||||
for (const auto & tool_call : tool_calls) {
|
|
||||||
const auto & arguments = tool_call["arguments"];
|
|
||||||
result.tool_calls.push_back({
|
|
||||||
tool_call["name"],
|
|
||||||
arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
|
|
||||||
tool_call.contains("id") ? tool_call["id"] : "",
|
|
||||||
});
|
|
||||||
}
|
|
||||||
};
|
|
||||||
if (content_end == std::string::npos) {
|
|
||||||
result.content = input;
|
|
||||||
} else {
|
|
||||||
tc_start = content_end + prefix.size() - rstrip_prefix;
|
|
||||||
result.content = input.substr(0, content_end);
|
|
||||||
auto tool_calls = json::parse(input.substr(tc_start));
|
|
||||||
process_tool_calls(tool_calls);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
|
|
||||||
for (const auto & tool : tools) {
|
|
||||||
if (!tool.contains("type") || tool["type"] != "function" || !tool.contains("function")) {
|
|
||||||
LOG_INF("Skipping tool without function: %s", tool.dump(2).c_str());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
fn(tool);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::string apply(
|
|
||||||
const common_chat_template & tmpl,
|
|
||||||
const nlohmann::ordered_json & messages,
|
|
||||||
const nlohmann::ordered_json & tools,
|
|
||||||
bool add_generation_prompt,
|
|
||||||
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
|
|
||||||
{
|
|
||||||
minja::chat_template_inputs tmpl_inputs;
|
|
||||||
tmpl_inputs.messages = messages;
|
|
||||||
tmpl_inputs.tools = tools;
|
|
||||||
tmpl_inputs.add_generation_prompt = add_generation_prompt;
|
|
||||||
tmpl_inputs.extra_context = extra_context;
|
|
||||||
// TODO: add flag to control date/time, if only for testing purposes.
|
|
||||||
// tmpl_inputs.now = std::chrono::system_clock::now();
|
|
||||||
|
|
||||||
minja::chat_template_options tmpl_opts;
|
|
||||||
tmpl_opts.use_bos_token = false;
|
|
||||||
tmpl_opts.use_eos_token = false;
|
|
||||||
|
|
||||||
return tmpl.apply(tmpl_inputs, tmpl_opts);
|
|
||||||
}
|
|
||||||
|
|
||||||
static common_chat_params common_chat_params_init_generic(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
|
||||||
common_chat_params data;
|
|
||||||
|
|
||||||
auto tool_call_schemas = json::array();
|
|
||||||
foreach_function(inputs.tools, [&](const json & tool) {
|
|
||||||
const auto & function = tool["function"];
|
|
||||||
auto tool_schema = json {
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
{"name", {
|
|
||||||
{"type", "string"},
|
|
||||||
{"const", function["name"]},
|
|
||||||
}},
|
|
||||||
{"arguments", function["parameters"]},
|
|
||||||
}},
|
|
||||||
{"required", json::array({"name", "arguments"})},
|
|
||||||
};
|
|
||||||
if (function.contains("description")) {
|
|
||||||
tool_schema["description"] = function["description"];
|
|
||||||
}
|
|
||||||
if (inputs.parallel_tool_calls) {
|
|
||||||
tool_schema["properties"]["id"] = {
|
|
||||||
{"type", "string"},
|
|
||||||
{"minLength", 4},
|
|
||||||
};
|
|
||||||
tool_schema["required"].push_back("id");
|
|
||||||
}
|
|
||||||
tool_call_schemas.emplace_back(tool_schema);
|
|
||||||
});
|
|
||||||
const auto tool_call =
|
|
||||||
inputs.parallel_tool_calls
|
|
||||||
? json {
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
{"tool_calls", {
|
|
||||||
{"type", "array"},
|
|
||||||
{"items", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
|
|
||||||
{"anyOf", tool_call_schemas},
|
|
||||||
}},
|
|
||||||
{"minItems", 1},
|
|
||||||
}},
|
|
||||||
}},
|
|
||||||
{"required", json::array({"tool_calls"})},
|
|
||||||
}
|
|
||||||
: json {
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
{"tool_call", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
|
|
||||||
{"anyOf", tool_call_schemas},
|
|
||||||
}},
|
|
||||||
}},
|
|
||||||
{"required", json::array({"tool_call"})},
|
|
||||||
};
|
|
||||||
const auto schema =
|
|
||||||
inputs.tool_choice != "required"
|
|
||||||
? json {
|
|
||||||
{"anyOf", json::array({
|
|
||||||
tool_call,
|
|
||||||
{
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
{"response", inputs.json_schema.is_null()
|
|
||||||
? json {{"type", "string"}}
|
|
||||||
: inputs.json_schema
|
|
||||||
},
|
|
||||||
}},
|
|
||||||
{"required", json::array({"response"})},
|
|
||||||
},
|
|
||||||
})}
|
|
||||||
}
|
|
||||||
: tool_call;
|
|
||||||
|
|
||||||
data.grammar_lazy = false;
|
|
||||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
|
||||||
builder.add_schema("root", schema);
|
|
||||||
}, grammar_options);
|
|
||||||
|
|
||||||
auto tweaked_messages = common_chat_template::add_system(
|
|
||||||
inputs.messages,
|
|
||||||
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
|
|
||||||
|
|
||||||
data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
|
||||||
data.format = COMMON_CHAT_FORMAT_GENERIC;
|
|
||||||
return data;
|
|
||||||
}
|
|
||||||
static common_chat_msg common_chat_parse_generic(const std::string & input) {
|
|
||||||
json data = json::parse(input);
|
|
||||||
common_chat_msg result;
|
|
||||||
result.role = "assistant";
|
|
||||||
if (data.contains("tool_calls")) {
|
|
||||||
for (const auto & tool_call : data["tool_calls"]) {
|
|
||||||
result.tool_calls.push_back({
|
|
||||||
tool_call["name"],
|
|
||||||
tool_call["arguments"].dump(),
|
|
||||||
tool_call.contains("id") ? tool_call["id"] : "",
|
|
||||||
});
|
|
||||||
}
|
|
||||||
} else if (data.contains("tool_call")) {
|
|
||||||
result.tool_calls.push_back({
|
|
||||||
data["tool_call"]["name"],
|
|
||||||
data["tool_call"]["arguments"].dump(),
|
|
||||||
/* id= */ "",
|
|
||||||
});
|
|
||||||
} else if (data.contains("response")) {
|
|
||||||
const auto & response = data["response"];
|
|
||||||
result.content = response.is_string() ? response.get<std::string>() : response.dump(2);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static common_chat_params common_chat_params_init_mistral_nemo(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
|
||||||
common_chat_params data;
|
|
||||||
data.grammar_lazy = inputs.tool_choice != "required";
|
|
||||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
|
||||||
auto schemas = json::array();
|
|
||||||
foreach_function(inputs.tools, [&](const json & tool) {
|
|
||||||
const auto & function = tool["function"];
|
|
||||||
schemas.push_back({
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
// Important note: the model is probably trained to take a JSON stringified arguments value.
|
|
||||||
// It's hard to constrain that for now (while reusing the JSON schema conversion), so we're just expecting a plain object.
|
|
||||||
{"name", {
|
|
||||||
{"type", "string"},
|
|
||||||
{"const", function["name"]},
|
|
||||||
}},
|
|
||||||
{"arguments", function["parameters"]},
|
|
||||||
{"id", {
|
|
||||||
{"type", "string"},
|
|
||||||
// Nemo's template expects a 9-character alphanumeric ID.
|
|
||||||
{"pattern", "^[a-zA-Z0-9]{9}$"},
|
|
||||||
}},
|
|
||||||
}},
|
|
||||||
{"required", json::array({"name", "arguments", "id"})},
|
|
||||||
});
|
|
||||||
});
|
|
||||||
auto schema = json {
|
|
||||||
{"type", "array"},
|
|
||||||
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
|
|
||||||
{"minItems", 1},
|
|
||||||
};
|
|
||||||
if (!inputs.parallel_tool_calls) {
|
|
||||||
schema["maxItems"] = 1;
|
|
||||||
}
|
|
||||||
builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
|
|
||||||
}, grammar_options);
|
|
||||||
data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true});
|
|
||||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
|
||||||
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
|
|
||||||
return data;
|
|
||||||
}
|
|
||||||
static common_chat_msg common_chat_parse_mistral_nemo(const std::string & input) {
|
|
||||||
return parse_prefixed_json_tool_call_array(input, "[TOOL_CALLS]");
|
|
||||||
}
|
|
||||||
|
|
||||||
static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
|
||||||
common_chat_params data;
|
|
||||||
data.grammar_lazy = inputs.tool_choice != "required";
|
|
||||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
|
||||||
auto schemas = json::array();
|
|
||||||
foreach_function(inputs.tools, [&](const json & tool) {
|
|
||||||
const auto & function = tool["function"];
|
|
||||||
schemas.push_back({
|
|
||||||
{"type", "object"},
|
|
||||||
{"properties", {
|
|
||||||
{"tool_call_id", {
|
|
||||||
{"type", "string"},
|
|
||||||
// Command-R's template expects an integer string.
|
|
||||||
{"pattern", "^[0-9]{1,10}$"},
|
|
||||||
}},
|
|
||||||
{"tool_name", {
|
|
||||||
{"type", "string"},
|
|
||||||
{"const", function["name"]},
|
|
||||||
}},
|
|
||||||
{"parameters", function["parameters"]},
|
|
||||||
}},
|
|
||||||
{"required", json::array({"tool_call_id", "tool_name", "parameters"})},
|
|
||||||
});
|
|
||||||
});
|
|
||||||
auto schema = json {
|
|
||||||
{"type", "array"},
|
|
||||||
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
|
|
||||||
{"minItems", 1},
|
|
||||||
};
|
|
||||||
if (!inputs.parallel_tool_calls) {
|
|
||||||
schema["maxItems"] = 1;
|
|
||||||
}
|
|
||||||
builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
|
|
||||||
}, grammar_options);
|
|
||||||
data.grammar_triggers.push_back({"<|START_ACTION|>", /* .at_start = */ false});
|
|
||||||
data.preserved_tokens = {
|
|
||||||
"<|START_RESPONSE|>",
|
|
||||||
"<|END_RESPONSE|>",
|
|
||||||
"<|START_THINKING|>",
|
|
||||||
"<|END_THINKING|>",
|
|
||||||
"<|END_ACTION|>",
|
|
||||||
};
|
|
||||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
|
||||||
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
|
|
||||||
return data;
|
|
||||||
}
|
|
||||||
static common_chat_msg common_chat_parse_command_r7b(const std::string & input) {
|
|
||||||
static std::regex response_regex("<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
|
|
||||||
static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
|
|
||||||
std::smatch match;
|
|
||||||
|
|
||||||
common_chat_msg result;
|
|
||||||
result.role = "assistant";
|
|
||||||
if (std::regex_match(input, match, response_regex)) {
|
|
||||||
result.content = match[1].str();
|
|
||||||
} else if (std::regex_match(input, match, thought_action_regex)) {
|
|
||||||
result.tool_plan = match[1].str();
|
|
||||||
auto actions_str = match[2].str();
|
|
||||||
auto actions = json::parse(actions_str);
|
|
||||||
for (const auto & action : actions) {
|
|
||||||
result.tool_calls.push_back({
|
|
||||||
/* .name = */ action["tool_name"],
|
|
||||||
/* .arguments = */ action["parameters"].dump(),
|
|
||||||
/* .id = */ action["tool_call_id"],
|
|
||||||
});
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOG_ERR("Failed to parse command_r output");
|
|
||||||
result.content = input;
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void expect_tool_parameters(const std::string & name, const json & parameters, const std::vector<std::string> & expected_properties) {
    if (!parameters.is_object() || !parameters.contains("type") || parameters["type"] != "object" || !parameters.contains("properties") || !parameters.contains("required")) {
        throw std::runtime_error("Parameters of tool " + name + " must be an object w/ required properties");
    }
    const auto & parameters_properties = parameters.at("properties");
    const auto & parameters_required = parameters.at("required");
    for (const auto & prop : expected_properties) {
        if (!parameters_properties.contains(prop)) {
            throw std::runtime_error("Parameters of tool " + name + " is missing property: " + prop);
        }
        if (std::find(parameters_required.begin(), parameters_required.end(), json(prop)) == parameters_required.end()) {
            throw std::runtime_error("Parameters of tool " + name + " must have property marked as required: " + prop);
        }
    }
    if (parameters_properties.size() != expected_properties.size()) {
        throw std::runtime_error("Parameters of tool " + name + " must only have these properties:" + string_join(expected_properties, ", "));
    }
}
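// Illustrative sketch (not from the original source): a minimal parameters
// schema that expect_tool_parameters() would accept for the "python" /
// "code_interpreter" builtin, i.e. an object with exactly the expected
// properties, all marked as required. The schema shown here is an assumption
// for illustration only.
//
//     json params = {
//         {"type", "object"},
//         {"properties", {
//             {"code", {{"type", "string"}}},
//         }},
//         {"required", json::array({"code"})},
//     };
//     expect_tool_parameters("code_interpreter", params, {"code"});  // passes all checks above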
static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const common_chat_template & tmpl, const struct common_chat_inputs & inputs, bool allow_python_tag_builtin_tools) {
    auto builtin_tools = json::array();
    common_chat_params data;
    data.grammar_lazy = inputs.tool_choice != "required";
    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
        std::vector<std::string> tool_rules;

        auto handle_builtin_tool = [&](const std::string & name, const json & parameters) {
            if (name == "wolfram_alpha") {
                // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
                expect_tool_parameters(name, parameters, {"query"});
            } else if (name == "web_search" || name == "brave_search") {
                // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
                expect_tool_parameters(name, parameters, {"query"});
            } else if (name == "python" || name == "code_interpreter") {
                // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
                expect_tool_parameters(name, parameters, {"code"});
            } else {
                return false;
            }

            std::vector<std::string> kvs;
            for (const auto & [key, value] : parameters.at("properties").items()) {
                kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value));
            }

            tool_rules.push_back(
                builder.add_rule(
                    name + "-call",
                    "\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\""));
            builtin_tools.push_back(name);

            return true;
        };

        foreach_function(inputs.tools, [&](const json & tool) {
            const auto & function = tool["function"];
            std::string name = function["name"];
            auto parameters = function["parameters"];
            builder.resolve_refs(parameters);

            // https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
            if (allow_python_tag_builtin_tools) {
                handle_builtin_tool(name, parameters);
            }
            tool_rules.push_back(
                builder.add_rule(
                    name + "-call",
                    "\"{\" space "
                    "( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? "
                    "\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
                        builder.add_schema(name + "-args", parameters) +
                    " \"}\""));
            data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true});
        });
        data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true});
        data.grammar_triggers.push_back({"{\n  \"name\":", /* .at_start = */ true});
        data.grammar_triggers.push_back({"{\n    \"name\":", /* .at_start = */ true});
        data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true});
        data.grammar_triggers.push_back({"{\n  \"type\": \"function\"", /* .at_start = */ true});
        data.grammar_triggers.push_back({"{\n    \"type\": \"function\"", /* .at_start = */ true});
        if (!builtin_tools.empty()) {
            data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
        }
        builder.add_rule("root", string_join(tool_rules, " | "));
    }, grammar_options);
    data.additional_stops.push_back("<|eom_id|>");
    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
        {"tools_in_user_message", false},
        {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
    });
    data.format = allow_python_tag_builtin_tools && !builtin_tools.empty()
        ? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS
        : COMMON_CHAT_FORMAT_LLAMA_3_X;
    return data;
}
static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
    // TODO: tighten & simplify the parser, don't accept leading text context.
    static std::regex function_regex("\\{[\\s\\n\\r]*(?:\"type\"[\\s\\n\\r]*:[\\s\\n\\r]*\"function\"[\\s\\n\\r]*,[\\s\\n\\r]*|[\\s\\n\\r]*)\"name\"[\\s\\n\\r]*:[\\s\\n\\r]*\"([^\"]+)\"[\\s\\n\\r]*,[\\s\\n\\r]*\"parameters\": ");
    static std::regex close_regex("\\}");
    static std::regex builtin_call_regex("<\\|python_tag\\|>([^.(]+)\\.call\\((.*)\\)");

    if (with_builtin_tools) {
        std::smatch match;
        if (std::regex_match(input, match, builtin_call_regex)) {
            auto name = match[1].str();
            auto raw_args = match[2].str();

            // TODO: if/when builtin tools start accepting more than 1 argument, use parse_json for real parsing.
            auto it_eq = raw_args.find('=');
            auto arg_name = raw_args.substr(0, it_eq);
            auto arg_value_str = raw_args.substr(it_eq + 1);
            auto arg_value = json::parse(arg_value_str);

            return {
                /* .role = */ "assistant",
                /* .content = */ match.prefix().str(),
                /* .tool_calls = */ {
                    {
                        /* .name = */ match[1],
                        /* .arguments = */ (json {
                            {arg_name, arg_value},
                        }).dump(),
                        /* .id = */ "",
                    },
                },
            };
        }
    }
    return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
}
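// Illustrative sketch (not from the original source): with builtin tools
// enabled, a completion of the form
//
//     <|python_tag|>brave_search.call(query="current weather in Paris")
//
// matches builtin_call_regex above and yields a single "brave_search" tool
// call with arguments {"query": "current weather in Paris"}; the query text is
// a made-up example.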
static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
    common_chat_params data;
    data.grammar_lazy = inputs.tool_choice != "required";
    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
        std::vector<std::string> tool_rules;
        foreach_function(inputs.tools, [&](const json & tool) {
            const auto & function = tool["function"];
            std::string name = function["name"];
            auto parameters = function["parameters"];
            auto args_rule = builder.add_schema(name + "-args", parameters);
            tool_rules.push_back(builder.add_rule(name + "-call",
                "\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n```json\\n\" " + args_rule + " \"```<|tool▁call▁end|>\""));
        });
        data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", /* .at_start = */ false});
        data.preserved_tokens = {
            "<|tool▁sep|>",
            "<|tool▁call▁end|>",
        };
        builder.add_rule("root", "\"<|tool▁calls▁begin|>\" (" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " space");
    }, grammar_options);
    auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
    data.prompt = prompt;
    data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
    return data;
}
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input) {
    static std::regex trigger_regex("<|tool▁calls▁begin|>");
    static std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n");
    static std::regex close_regex("```<|tool▁call▁end|>");
    return parse_json_tool_calls(input, trigger_regex, function_regex, close_regex);
}
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
    fprintf(stderr, "%s\n", __func__);
    common_chat_params data;
    data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
        {"datetime", "Jan 29 2025 13:00:00 GMT"},
        {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
    });
    if (!inputs.tools.is_null() && !inputs.tools.empty()) {
        data.grammar_lazy = inputs.tool_choice != "required";
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            auto schemas = json::array();
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool["function"];
                schemas.push_back({
                    {"type", "object"},
                    {"properties", {
                        {"name", {
                            {"type", "string"},
                            {"const", function["name"]},
                        }},
                        {"arguments", function["parameters"]},
                    }},
                    {"required", json::array({"name", "arguments", "id"})},
                });
            });
            auto schema = json {
                {"type", "array"},
                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
                {"minItems", 1},
            };
            if (!inputs.parallel_tool_calls) {
                schema["maxItems"] = 1;
            }
            builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema));
        }, grammar_options);
        data.grammar_triggers.push_back({" functools[", /* .at_start = */ false});
        data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2;
    } else {
        data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    }
    return data;
}
static common_chat_msg common_chat_parse_firefunction_v2(const std::string & input) {
    return parse_prefixed_json_tool_call_array(input, " functools[", /* rstrip_prefix= */ 1);
}
static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
    // >>>all\nlet's call functions>>>fn1\n{"arg1": 1...}\n>>>fn2\n{"arg1": 1...}...
    // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
    common_chat_params data;
    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
    data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
    if (!inputs.tools.is_null() && !inputs.tools.empty()) {
        data.grammar_lazy = inputs.tool_choice != "required";
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            std::vector<std::string> first_tool_rules;
            std::vector<std::string> subsequent_tool_rules;
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool["function"];
                std::string name = function["name"];
                auto parameters = function["parameters"];
                auto args_rule = builder.add_schema(name + "-args", parameters);
                first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule));
                subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule));
                data.grammar_triggers.push_back({name, /* .at_start = */ true});
                data.grammar_triggers.push_back({">>>" + name, /* .at_start = */ false});
            });
            auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space";
            if (inputs.parallel_tool_calls) {
                auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space";
                builder.add_rule("root", first_rule + " (" + subsequent_rule + ")*");
            } else {
                builder.add_rule("root", first_rule);
            }

        }, grammar_options);
    }
    return data;
}
static bool consume(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
    auto expected_it = expected.begin();
    auto tmp_it = it;
    while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
        ++tmp_it;
        ++expected_it;
    }
    if (expected_it == expected.end()) {
        it = tmp_it;
        return true;
    }
    return false;
}
static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
    static std::regex function_regex(R"((?:>>>)?(\w+)\n)");
    static std::regex close_regex(R"($|(?=>>>))");

    std::string content;
    auto it = input.begin();
    const auto end = input.end();

    if (consume(it, end, "all\n")) {
        std::smatch match;
        if (std::regex_search(it, end, match, function_regex)) {
            auto fun_it = match.prefix().second;
            content = std::string(it, fun_it);
            it = fun_it;
        } else {
            common_chat_msg res;
            res.role = "assistant";
            res.content = std::string(it, end);
            return res;
        }
    }
    // TODO: tighten & simplify.
    try {
        auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
        res.content = content + res.content;
        return res;
    } catch (const std::exception & e) {
        LOG_ERR("Failed to parse functionary v3.2 input: %s\n", e.what());
        common_chat_msg res;
        res.role = "assistant";
        res.content = input;
        return res;
    }
}
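// Illustrative sketch (not from the original source): a Functionary v3.2 style
// completion such as
//
//     all
//     Let me check that.>>>get_weather
//     {"city": "Paris"}
//
// is handled above by consuming the leading "all\n", keeping "Let me check
// that." as content, and extracting one get_weather tool call; the tool name
// and arguments are made-up examples.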
static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
    // https://github.com/MeetKai/functionary/blob/main/tests/prompt_test_v3-llama3.1.txt
    common_chat_params data;
    json tools = inputs.tools.is_null() ? inputs.tools : json::array();
    std::string python_code_argument_name;
    auto has_raw_python = false;

    data.grammar_lazy = inputs.tool_choice != "required";
    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
        std::vector<std::string> tool_rules;
        foreach_function(inputs.tools, [&](const json & tool) {
            const auto & function = tool["function"];
            const auto & parameters = function["parameters"];
            std::string name = function["name"];
            if (name == "python" || name == "ipython") {
                if (!parameters.contains("type")) {
                    throw std::runtime_error("Missing type in python tool");
                }
                has_raw_python = true;
                auto type = parameters.at("type");
                if (type == "object") {
                    auto properties = parameters.at("properties");
                    for (auto it = properties.begin(); it != properties.end(); ++it) {
                        if (it.value().at("type") == "string") {
                            if (!python_code_argument_name.empty()) {
                                throw std::runtime_error("Multiple string arguments found in python tool");
                            }
                            python_code_argument_name = it.key();
                        }
                    }
                    if (python_code_argument_name.empty()) {
                        throw std::runtime_error("No string argument found in python tool");
                    }
                } else if (type != "string") {
                    throw std::runtime_error("Invalid type in python tool: " + type.dump());
                }
            }
            tool_rules.push_back(builder.add_rule(name + "-call", "\"<function=" + name + ">\" " + builder.add_schema(name + "-args", parameters) + " \"</function>\" space"));
        });
        if (has_raw_python) {
            tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
            data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
        }
        auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
        builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
        data.grammar_triggers.push_back({"<function=", /* .at_start = */ false});
    }, grammar_options);

    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
    // TODO: if (has_raw_python)
    data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
    return data;
}
static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
    // This version of Functionary still supports the llama 3.1 tool call format for the python tool.
    static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
    std::smatch match;
    if (std::regex_search(input, match, python_tag_regex)) {
        auto code = match[1].str();
        return {
            /* .role = */ "assistant",
            /* .content = */ match.prefix().str(),
            /* .tool_calls = */ {
                {
                    /* .name = */ "python",
                    /* .arguments = */ (json {{"code", code}}).dump(),
                    /* .id = */ "",
                },
            }
        };
    }
    static std::regex function_regex(R"(<function=(\w+)>)");
    static std::regex close_regex(R"(</function>)");
    // TODO: tighten & simplify.
    return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
}
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
    common_chat_params data;
    // (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
    data.grammar_lazy = inputs.tool_choice != "required";
    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
        std::vector<std::string> tool_rules;
        foreach_function(inputs.tools, [&](const json & tool) {
            const auto & function = tool["function"];
            std::string name = function["name"];
            auto parameters = function["parameters"];
            builder.resolve_refs(parameters);
            tool_rules.push_back(builder.add_schema(name + "-call", {
                {"type", "object"},
                {"properties", json {
                    {"name", json {{"const", name}}},
                    {"arguments", parameters},
                }},
                {"required", json::array({"name", "arguments"})},
            }));
        });
        auto tool_call = "\"<tool_call>\" space " + builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " \"</tool_call>\" space";
        builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
        data.grammar_triggers.push_back({"<tool_call>", /* .at_start = */ false});
        data.preserved_tokens = { "</tool_call>" };
    }, grammar_options);

    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
    data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
    return data;
}
static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) {
    try {
        std::regex start_pattern(R"([\n\s]*<tool_call>)");
        std::regex middle_pattern(R"([\n\s]*</tool_call>[\n\s]*<tool_call>)");
        std::regex end_pattern(R"([\n\s]*</tool_call>[\n\s]*$)");

        auto end = input.end();
        std::sregex_iterator rend;
        std::sregex_iterator rit(input.begin(), end, start_pattern);
        if (rit == rend) {
            return {
                /* .role = */ "assistant",
                /* .content = */ input,
                /* .tool_calls = */ {},
            };
        }

        common_chat_msg result;
        result.role = "assistant";
        result.content = rit->prefix();

        auto it = rit->suffix().first;
        while (it != end) {
            json call;
            if (!parse_json(it, end, call)) {
                throw std::runtime_error("Failed to parse json tool call");
            }
            const auto & arguments = call["arguments"];
            result.tool_calls.push_back({
                call["name"],
                arguments.dump(),
                // arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
                /* id= */ "",
            });
            rit = {it, end, middle_pattern};
            if (rit != rend) {
                it = rit->suffix().first;
            } else {
                rit = {it, end, end_pattern};
                if (rit == rend) {
                    throw std::runtime_error("Malformed input, missing </tool_call>");
                }
                break;
            }
        }
        return result;
    } catch (const std::exception & e) {
        return {
            /* .role = */ "assistant",
            /* .content = */ input,
            /* .tool_calls = */ {},
        };
    }
}
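// Illustrative sketch (not from the original source): a Hermes 2 Pro style
// completion such as
//
//     The weather tool can help.
//     <tool_call>
//     {"name": "get_weather", "arguments": {"city": "Paris"}}
//     </tool_call>
//
// is split above into the leading text as content plus one get_weather tool
// call; the tool name and arguments are made-up examples.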
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
    common_chat_params data;
    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
    data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    data.grammar_lazy = false;
    if (!inputs.json_schema.is_null()) {
        if (!inputs.grammar.empty()) {
            throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
        }
        data.grammar = json_schema_to_grammar(inputs.json_schema);
    } else {
        data.grammar = inputs.grammar;
    }
    return data;
}
common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
    auto has_tools = !inputs.tools.is_null() && inputs.tool_choice != "none";
    LOG_DBG("[%s] has_tools=%s\n", __func__, has_tools ? "true" : "false");

    if (has_tools && !inputs.grammar.empty()) {
        throw std::runtime_error("Cannot specify grammar with tools");
    }

    const auto & src = tmpl.source();
    if (src.find(">>>all") != std::string::npos) {
        // Functionary prepends "all\n" to plain content outputs, so we use its parser even when no tools are given.
        return common_chat_params_init_functionary_v3_2(tmpl, inputs);
    }
    if (src.find(" functools[") != std::string::npos) {
        // Firefunction v2 requires datetime and functions in the context, even w/o tools.
        return common_chat_params_init_firefunction_v2(tmpl, inputs);
    }

    if (!has_tools) {
        return common_chat_params_init_without_tools(tmpl, inputs);
    }

    if (src.find("<tool_call>") != std::string::npos) {
        return common_chat_params_init_hermes_2_pro(tmpl, inputs);
    }
    if (src.find("<|start_header_id|>") != std::string::npos
        && src.find("<function=") != std::string::npos) {
        return common_chat_params_init_functionary_v3_1_llama_3_1(tmpl, inputs);
    }
    if (src.find("<|start_header_id|>ipython<|end_header_id|>") != std::string::npos) {
        auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
        return common_chat_params_init_llama_3_1_tool_calls(tmpl, inputs, allow_python_tag_builtin_tools);
    }
    if (src.find("<|tool▁calls▁begin|>") != std::string::npos) {
        return common_chat_params_init_deepseek_r1(tmpl, inputs);
    }
    if (src.find("[TOOL_CALLS]") != std::string::npos) {
        return common_chat_params_init_mistral_nemo(tmpl, inputs);
    }
    if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos) {
        return common_chat_params_init_command_r7b(tmpl, inputs);
    }
    return common_chat_params_init_generic(tmpl, inputs);
}
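// Illustrative sketch (not from the original source): one way a caller could
// tie common_chat_params_init() and common_chat_parse() together. The
// hardcoded `output` string below merely stands in for a model completion (a
// Hermes 2 Pro style example); a real caller would run the model on
// data.prompt, constrained by params.grammar / params.grammar_triggers.
static common_chat_msg common_chat_example_round_trip(const common_chat_template & tmpl, const json & messages, const json & tools) {
    common_chat_inputs inputs;
    inputs.messages              = messages;
    inputs.tools                 = tools;      // OpenAI-style tool definitions, or null
    inputs.tool_choice           = "auto";
    inputs.parallel_tool_calls   = false;
    inputs.stream                = false;
    inputs.add_generation_prompt = true;

    // Select the handler matching the template source, build the prompt and the (lazy) grammar.
    common_chat_params params = common_chat_params_init(tmpl, inputs);

    // Placeholder completion purely for illustration.
    std::string output = "<tool_call>\n{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}\n</tool_call>";

    // Map the raw completion back to content + structured tool calls.
    return common_chat_parse(output, params.format);
}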
static common_chat_msg common_chat_parse_content_only(const std::string & input) {
    return {
        /* .role = */ "assistant",
        /* .content = */ input,
        /* .tool_calls = */ {},
    };
}
common_chat_msg common_chat_parse(const std::string & input, common_chat_format format) {
    switch (format) {
        case COMMON_CHAT_FORMAT_CONTENT_ONLY:
            return common_chat_parse_content_only(input);
        case COMMON_CHAT_FORMAT_GENERIC:
            return common_chat_parse_generic(input);
        case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
            return common_chat_parse_mistral_nemo(input);
        case COMMON_CHAT_FORMAT_LLAMA_3_X:
            return common_chat_parse_llama_3_1(input);
        case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS:
            return common_chat_parse_llama_3_1(input, /* with_builtin_tools= */ true);
        case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
            return common_chat_parse_deepseek_r1(input);
        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
            return common_chat_parse_functionary_v3_2(input);
        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
            return common_chat_parse_functionary_v3_1_llama_3_1(input);
        case COMMON_CHAT_FORMAT_HERMES_2_PRO:
            return common_chat_parse_hermes_2_pro(input);
        case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
            return common_chat_parse_firefunction_v2(input);
        case COMMON_CHAT_FORMAT_COMMAND_R7B:
            return common_chat_parse_command_r7b(input);
        default:
            throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
    }
}
@@ -1,52 +0,0 @@
// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.

#pragma once

#include "common.h"
#include <json.hpp>
#include <optional>
#include <string>
#include <vector>

using json = nlohmann::ordered_json;

struct common_chat_inputs {
    json messages;
    json tools;
    json tool_choice;
    json json_schema;
    bool parallel_tool_calls;
    bool stream;
    std::string grammar;
    bool add_generation_prompt = true;
};

enum common_chat_format {
    COMMON_CHAT_FORMAT_CONTENT_ONLY,
    COMMON_CHAT_FORMAT_GENERIC,
    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
    COMMON_CHAT_FORMAT_LLAMA_3_X,
    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
    COMMON_CHAT_FORMAT_HERMES_2_PRO,
    COMMON_CHAT_FORMAT_COMMAND_R7B,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};

struct common_chat_params {
    common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    json prompt;
    std::string grammar;
    bool grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_triggers;
    std::vector<std::string> preserved_tokens;
    std::vector<std::string> additional_stops;
};

struct common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & params);
std::string common_chat_format_name(common_chat_format format);
common_chat_msg common_chat_parse(const std::string & input, common_chat_format format);