Compare commits
1 commit
master
...
gg/check-p
Author | SHA1 | Date | |
---|---|---|---|
|
4356325ef5 |
1119 changed files with 267471 additions and 243075 deletions
161
.clang-format
161
.clang-format
|
@ -1,161 +0,0 @@
|
|||
---
|
||||
Language: Cpp
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignArrayOfStructures: Left
|
||||
AlignConsecutiveAssignments: AcrossComments
|
||||
AlignConsecutiveBitFields: AcrossComments
|
||||
AlignConsecutiveDeclarations: AcrossComments
|
||||
AlignConsecutiveMacros: AcrossComments
|
||||
# AlignConsecutiveShortCaseStatements: AcrossComments
|
||||
AlignEscapedNewlines: Left # LeftWithLastLine
|
||||
AlignOperands: Align
|
||||
AlignTrailingComments:
|
||||
Kind: Always
|
||||
OverEmptyLines: 1
|
||||
AllowAllArgumentsOnNextLine: true
|
||||
AllowAllParametersOfDeclarationOnNextLine: false
|
||||
# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
|
||||
AllowShortBlocksOnASingleLine: Never
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: Inline
|
||||
AllowShortIfStatementsOnASingleLine: Never
|
||||
AllowShortLambdasOnASingleLine: Inline
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
BinPackArguments: true
|
||||
BinPackParameters: true # OnePerLine
|
||||
BitFieldColonSpacing: Both
|
||||
BreakBeforeBraces: Custom # Attach
|
||||
BraceWrapping:
|
||||
AfterCaseLabel: true
|
||||
AfterClass: false
|
||||
AfterControlStatement: false
|
||||
AfterEnum: false
|
||||
AfterFunction: false
|
||||
AfterNamespace: false
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
AfterExternBlock: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
BeforeLambdaBody: false
|
||||
BeforeWhile: false
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: false
|
||||
SplitEmptyRecord: false
|
||||
SplitEmptyNamespace: false
|
||||
# BreakAdjacentStringLiterals: true
|
||||
BreakAfterAttributes: Never
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeInlineASMColon: OnlyMultiline
|
||||
BreakBeforeTernaryOperators: false
|
||||
# BreakBinaryOperations: Never
|
||||
BreakConstructorInitializers: AfterColon
|
||||
# BreakFunctionDefinitionParameters: false
|
||||
BreakInheritanceList: AfterComma
|
||||
BreakStringLiterals: true
|
||||
# BreakTemplateDeclarations: Yes
|
||||
ColumnLimit: 120
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: false
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
EmptyLineBeforeAccessModifier: Leave
|
||||
EmptyLineAfterAccessModifier: Never
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: true
|
||||
IncludeBlocks: Regroup
|
||||
IncludeCategories:
|
||||
- Regex: '^<.*\.h>'
|
||||
Priority: 1
|
||||
SortPriority: 0
|
||||
- Regex: '^<.*'
|
||||
Priority: 2
|
||||
SortPriority: 0
|
||||
- Regex: '.*'
|
||||
Priority: 3
|
||||
SortPriority: 0
|
||||
IncludeIsMainRegex: '([-_](test|unittest))?$'
|
||||
IncludeIsMainSourceRegex: ''
|
||||
IndentAccessModifiers: false
|
||||
IndentCaseBlocks: true
|
||||
IndentCaseLabels: true
|
||||
IndentExternBlock: NoIndent
|
||||
IndentGotoLabels: false
|
||||
IndentPPDirectives: AfterHash
|
||||
IndentWidth: 4
|
||||
IndentWrappedFunctionNames: false
|
||||
InsertBraces: true # NOTE: may lead to incorrect formatting
|
||||
InsertNewlineAtEOF: true
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
LambdaBodyIndentation: Signature
|
||||
LineEnding: LF
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBinPackProtocolList: Auto
|
||||
ObjCBlockIndentWidth: 4
|
||||
ObjCSpaceAfterProperty: true
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
PPIndentWidth: -1
|
||||
PackConstructorInitializers: CurrentLine
|
||||
PenaltyBreakAssignment: 2
|
||||
PenaltyBreakBeforeFirstCallParameter: 1
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyBreakTemplateDeclaration: 10
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 200
|
||||
PointerAlignment: Middle
|
||||
QualifierAlignment: Left
|
||||
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
|
||||
RawStringFormats:
|
||||
- Language: Cpp
|
||||
Delimiters:
|
||||
- cc
|
||||
- CC
|
||||
- cpp
|
||||
- Cpp
|
||||
- CPP
|
||||
- 'c++'
|
||||
- 'C++'
|
||||
CanonicalDelimiter: ''
|
||||
ReferenceAlignment: Middle
|
||||
ReflowComments: false # IndentOnly
|
||||
SeparateDefinitionBlocks: Always
|
||||
SortIncludes: CaseInsensitive
|
||||
SortUsingDeclarations: LexicographicNumeric
|
||||
SpaceAfterCStyleCast: true
|
||||
SpaceAfterLogicalNot: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeCpp11BracedList: false
|
||||
SpaceBeforeCtorInitializerColon: true
|
||||
SpaceBeforeInheritanceColon: true
|
||||
SpaceBeforeParens: ControlStatements
|
||||
SpaceBeforeRangeBasedForLoopColon: true
|
||||
SpaceInEmptyBlock: false
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 2
|
||||
SpacesInAngles: Never
|
||||
SpacesInContainerLiterals: true
|
||||
SpacesInLineCommentPrefix:
|
||||
Minimum: 1
|
||||
Maximum: -1
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
SpaceBeforeSquareBrackets: false
|
||||
Standard: c++17
|
||||
TabWidth: 4
|
||||
UseTab: Never
|
||||
WhitespaceSensitiveMacros: ['STRINGIZE']
|
||||
...
|
||||
|
|
@ -17,10 +17,8 @@ Checks: >
|
|||
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
|
||||
performance-*,
|
||||
portability-*,
|
||||
-portability-simd-intrinsics,
|
||||
misc-*,
|
||||
-misc-const-correctness,
|
||||
-misc-non-private-member-variables-in-classes,
|
||||
-misc-no-recursion,
|
||||
-misc-use-anonymous-namespace,
|
||||
FormatStyle: none
|
||||
|
|
|
@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
|
|||
stage('Running llama.cpp'){
|
||||
sh'''#!/bin/bash
|
||||
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
|
||||
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
|
||||
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
|
||||
cat llama_log.txt # Printing results
|
||||
'''
|
||||
}
|
||||
|
|
|
@ -1,92 +0,0 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
ARG TARGETARCH
|
||||
|
||||
ARG GGML_CPU_ARM_ARCH=armv8-a
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then \
|
||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
|
||||
elif [ "$TARGETARCH" = "arm64" ]; then \
|
||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
|
||||
else \
|
||||
echo "Unsupported architecture"; \
|
||||
exit 1; \
|
||||
fi && \
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
&& cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
## Base image
|
||||
FROM ubuntu:$UBUNTU_VERSION AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl\
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
|
||||
### Full
|
||||
FROM base AS full
|
||||
|
||||
COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
&& pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Server, Server only
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
|
@ -1,94 +0,0 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=12.6.0
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# CUDA architecture to build for (defaults to all supported archs)
|
||||
ARG CUDA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
&& cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
## Base image
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl\
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
|
||||
### Full
|
||||
FROM base AS full
|
||||
|
||||
COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
&& pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Server, Server only
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
36
.devops/full-cuda.Dockerfile
Normal file
36
.devops/full-cuda.Dockerfile
Normal file
|
@ -0,0 +1,36 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=11.7.1
|
||||
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable CUDA
|
||||
ENV LLAMA_CUDA=1
|
||||
# Enable cURL
|
||||
ENV LLAMA_CURL=1
|
||||
|
||||
RUN make -j$(nproc)
|
||||
|
||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
50
.devops/full-rocm.Dockerfile
Normal file
50
.devops/full-rocm.Dockerfile
Normal file
|
@ -0,0 +1,50 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG ROCM_VERSION=5.6
|
||||
|
||||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
# This is mostly tied to rocBLAS supported archs.
|
||||
ARG ROCM_DOCKER_ARCH=\
|
||||
gfx803 \
|
||||
gfx900 \
|
||||
gfx906 \
|
||||
gfx908 \
|
||||
gfx90a \
|
||||
gfx1010 \
|
||||
gfx1030 \
|
||||
gfx1100 \
|
||||
gfx1101 \
|
||||
gfx1102
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
# Enable ROCm
|
||||
ENV LLAMA_HIPBLAS=1
|
||||
ENV CC=/opt/rocm/llvm/bin/clang
|
||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||
|
||||
# Enable cURL
|
||||
ENV LLAMA_CURL=1
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev
|
||||
|
||||
RUN make -j$(nproc)
|
||||
|
||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
25
.devops/full.Dockerfile
Normal file
25
.devops/full.Dockerfile
Normal file
|
@ -0,0 +1,25 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
ENV LLAMA_CURL=1
|
||||
|
||||
|
||||
RUN make -j$(nproc)
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
|
@ -1,91 +0,0 @@
|
|||
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
|
||||
|
||||
## Build Image
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
|
||||
|
||||
ARG GGML_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git libcurl4-openssl-dev
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||
echo "GGML_SYCL_F16 is set" \
|
||||
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
echo "Building with dynamic libs" && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
&& cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl\
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
### Full
|
||||
FROM base AS full
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
&& pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Server, Server only
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
||||
|
|
@ -1,44 +0,0 @@
|
|||
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
|
||||
|
||||
FROM ascendai/cann:$ASCEND_VERSION AS build
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN yum install -y gcc g++ cmake make
|
||||
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
|
||||
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
|
||||
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
|
||||
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
|
||||
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
|
||||
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
|
||||
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
|
||||
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
|
||||
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
|
||||
|
||||
# find libascend_hal.so, because the drive hasn`t been mounted.
|
||||
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
|
||||
|
||||
RUN echo "Building with static libs" && \
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
|
||||
cmake --build build --config Release --target llama-cli
|
||||
|
||||
# TODO: use image with NNRT
|
||||
FROM ascendai/cann:$ASCEND_VERSION AS runtime
|
||||
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
|
||||
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
|
||||
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
|
||||
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
|
||||
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
|
||||
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
|
||||
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
|
||||
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
|
||||
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
|
||||
|
||||
ENTRYPOINT ["/llama-cli" ]
|
84
.devops/llama-cpp-clblast.srpm.spec
Normal file
84
.devops/llama-cpp-clblast.srpm.spec
Normal file
|
@ -0,0 +1,84 @@
|
|||
# SRPM for building from source and packaging an RPM for RPM-based distros.
|
||||
# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
|
||||
# Built and maintained by John Boero - boeroboy@gmail.com
|
||||
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
|
||||
|
||||
# Notes for llama.cpp:
|
||||
# 1. Tags are currently based on hash - which will not sort asciibetically.
|
||||
# We need to declare standard versioning if people want to sort latest releases.
|
||||
# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
|
||||
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
|
||||
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
|
||||
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
|
||||
# It is up to the user to install the correct vendor-specific support.
|
||||
|
||||
Name: llama.cpp-clblast
|
||||
Version: %( date "+%%Y%%m%%d" )
|
||||
Release: 1%{?dist}
|
||||
Summary: OpenCL Inference of LLaMA model in C/C++
|
||||
License: MIT
|
||||
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
|
||||
BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
|
||||
Requires: clblast
|
||||
URL: https://github.com/ggerganov/llama.cpp
|
||||
|
||||
%define debug_package %{nil}
|
||||
%define source_date_epoch_from_changelog 0
|
||||
|
||||
%description
|
||||
CPU inference for Meta's Lllama2 models using default options.
|
||||
|
||||
%prep
|
||||
%setup -n llama.cpp-master
|
||||
|
||||
%build
|
||||
make -j LLAMA_CLBLAST=1
|
||||
|
||||
%install
|
||||
mkdir -p %{buildroot}%{_bindir}/
|
||||
cp -p main %{buildroot}%{_bindir}/llamaclblast
|
||||
cp -p server %{buildroot}%{_bindir}/llamaclblastserver
|
||||
cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
|
||||
|
||||
mkdir -p %{buildroot}/usr/lib/systemd/system
|
||||
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
|
||||
[Unit]
|
||||
Description=Llama.cpp server, CPU only (no GPU support in this build).
|
||||
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
EnvironmentFile=/etc/sysconfig/llama
|
||||
ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
|
||||
ExecReload=/bin/kill -s HUP $MAINPID
|
||||
Restart=never
|
||||
|
||||
[Install]
|
||||
WantedBy=default.target
|
||||
EOF
|
||||
|
||||
mkdir -p %{buildroot}/etc/sysconfig
|
||||
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
|
||||
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
|
||||
EOF
|
||||
|
||||
%clean
|
||||
rm -rf %{buildroot}
|
||||
rm -rf %{_builddir}/*
|
||||
|
||||
%files
|
||||
%{_bindir}/llamaclblast
|
||||
%{_bindir}/llamaclblastserver
|
||||
%{_bindir}/llamaclblastsimple
|
||||
/usr/lib/systemd/system/llamaclblast.service
|
||||
%config /etc/sysconfig/llama
|
||||
|
||||
|
||||
%pre
|
||||
|
||||
%post
|
||||
|
||||
%preun
|
||||
%postun
|
||||
|
||||
%changelog
|
|
@ -32,13 +32,13 @@ CPU inference for Meta's Lllama2 models using default options.
|
|||
%setup -n llama.cpp-master
|
||||
|
||||
%build
|
||||
make -j GGML_CUDA=1
|
||||
make -j LLAMA_CUDA=1
|
||||
|
||||
%install
|
||||
mkdir -p %{buildroot}%{_bindir}/
|
||||
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
|
||||
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
|
||||
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
|
||||
cp -p main %{buildroot}%{_bindir}/llamacppcuda
|
||||
cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
|
||||
cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
|
||||
|
||||
mkdir -p %{buildroot}/usr/lib/systemd/system
|
||||
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
|
||||
|
@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
|
|||
[Service]
|
||||
Type=simple
|
||||
EnvironmentFile=/etc/sysconfig/llama
|
||||
ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
|
||||
ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
|
||||
ExecReload=/bin/kill -s HUP $MAINPID
|
||||
Restart=never
|
||||
|
||||
|
@ -67,9 +67,9 @@ rm -rf %{buildroot}
|
|||
rm -rf %{_builddir}/*
|
||||
|
||||
%files
|
||||
%{_bindir}/llama-cuda-cli
|
||||
%{_bindir}/llama-cuda-server
|
||||
%{_bindir}/llama-cuda-simple
|
||||
%{_bindir}/llamacppcuda
|
||||
%{_bindir}/llamacppcudaserver
|
||||
%{_bindir}/llamacppcudasimple
|
||||
/usr/lib/systemd/system/llamacuda.service
|
||||
%config /etc/sysconfig/llama
|
||||
|
||||
|
|
|
@ -38,9 +38,9 @@ make -j
|
|||
|
||||
%install
|
||||
mkdir -p %{buildroot}%{_bindir}/
|
||||
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
|
||||
cp -p llama-server %{buildroot}%{_bindir}/llama-server
|
||||
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
|
||||
cp -p main %{buildroot}%{_bindir}/llama
|
||||
cp -p server %{buildroot}%{_bindir}/llamaserver
|
||||
cp -p simple %{buildroot}%{_bindir}/llamasimple
|
||||
|
||||
mkdir -p %{buildroot}/usr/lib/systemd/system
|
||||
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
|
||||
|
@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
|
|||
[Service]
|
||||
Type=simple
|
||||
EnvironmentFile=/etc/sysconfig/llama
|
||||
ExecStart=/usr/bin/llama-server $LLAMA_ARGS
|
||||
ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
|
||||
ExecReload=/bin/kill -s HUP $MAINPID
|
||||
Restart=never
|
||||
|
||||
|
@ -69,9 +69,9 @@ rm -rf %{buildroot}
|
|||
rm -rf %{_builddir}/*
|
||||
|
||||
%files
|
||||
%{_bindir}/llama-cli
|
||||
%{_bindir}/llama-server
|
||||
%{_bindir}/llama-simple
|
||||
%{_bindir}/llama
|
||||
%{_bindir}/llamaserver
|
||||
%{_bindir}/llamasimple
|
||||
/usr/lib/systemd/system/llama.service
|
||||
%config /etc/sysconfig/llama
|
||||
|
||||
|
|
35
.devops/main-cuda.Dockerfile
Normal file
35
.devops/main-cuda.Dockerfile
Normal file
|
@ -0,0 +1,35 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=11.7.1
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
# Target the CUDA runtime image
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable CUDA
|
||||
ENV LLAMA_CUDA=1
|
||||
|
||||
RUN make -j$(nproc) main
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libgomp1
|
||||
|
||||
COPY --from=build /app/main /main
|
||||
|
||||
ENTRYPOINT [ "/main" ]
|
34
.devops/main-intel.Dockerfile
Normal file
34
.devops/main-intel.Dockerfile
Normal file
|
@ -0,0 +1,34 @@
|
|||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
||||
rm /etc/apt/sources.list.d/intel-graphics.list && \
|
||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
||||
|
||||
ARG LLAMA_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
||||
echo "LLAMA_SYCL_F16 is set" && \
|
||||
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release --target main
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||
|
||||
COPY --from=build /app/build/bin/main /main
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/main" ]
|
45
.devops/main-rocm.Dockerfile
Normal file
45
.devops/main-rocm.Dockerfile
Normal file
|
@ -0,0 +1,45 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG ROCM_VERSION=5.6
|
||||
|
||||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
# This is mostly tied to rocBLAS supported archs.
|
||||
ARG ROCM_DOCKER_ARCH=\
|
||||
gfx803 \
|
||||
gfx900 \
|
||||
gfx906 \
|
||||
gfx908 \
|
||||
gfx90a \
|
||||
gfx1010 \
|
||||
gfx1030 \
|
||||
gfx1100 \
|
||||
gfx1101 \
|
||||
gfx1102
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
# Enable ROCm
|
||||
ENV LLAMA_HIPBLAS=1
|
||||
ENV CC=/opt/rocm/llvm/bin/clang
|
||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||
|
||||
RUN make -j$(nproc) main
|
||||
|
||||
ENTRYPOINT [ "/app/main" ]
|
27
.devops/main-vulkan.Dockerfile
Normal file
27
.devops/main-vulkan.Dockerfile
Normal file
|
@ -0,0 +1,27 @@
|
|||
ARG UBUNTU_VERSION=jammy
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
|
||||
# Install build tools
|
||||
RUN apt update && apt install -y git build-essential cmake wget libgomp1
|
||||
|
||||
# Install Vulkan SDK
|
||||
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
|
||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
|
||||
apt update -y && \
|
||||
apt-get install -y vulkan-sdk
|
||||
|
||||
# Build it
|
||||
WORKDIR /app
|
||||
COPY . .
|
||||
RUN cmake -B build -DLLAMA_VULKAN=1 && \
|
||||
cmake --build build --config Release --target main
|
||||
|
||||
# Clean up
|
||||
WORKDIR /
|
||||
RUN cp /app/build/bin/main /main && \
|
||||
rm -rf /app
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/main" ]
|
23
.devops/main.Dockerfile
Normal file
23
.devops/main.Dockerfile
Normal file
|
@ -0,0 +1,23 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN make -j$(nproc) main
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libgomp1
|
||||
|
||||
COPY --from=build /app/main /main
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/main" ]
|
|
@ -1,108 +0,0 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG MUSA_VERSION=rc3.1.0
|
||||
# Target the MUSA build image
|
||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
||||
|
||||
# MUSA architecture to build for (defaults to all supported archs)
|
||||
ARG MUSA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
build-essential \
|
||||
cmake \
|
||||
python3 \
|
||||
python3-pip \
|
||||
git \
|
||||
libcurl4-openssl-dev \
|
||||
libgomp1
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Use the default MUSA archs if not specified
|
||||
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
&& cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
## Base image
|
||||
FROM ${BASE_MUSA_RUN_CONTAINER} AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl\
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
|
||||
### Full
|
||||
FROM base AS full
|
||||
|
||||
COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
&& pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Server, Server only
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
|
@ -6,10 +6,11 @@
|
|||
let
|
||||
inherit (config.packages) default;
|
||||
binaries = [
|
||||
"llama-cli"
|
||||
"llama"
|
||||
"llama-embedding"
|
||||
"llama-server"
|
||||
"llama-quantize"
|
||||
"quantize"
|
||||
"train-text-from-scratch"
|
||||
];
|
||||
mkApp = name: {
|
||||
type = "app";
|
||||
|
|
|
@ -1,52 +1,13 @@
|
|||
{ inputs, ... }:
|
||||
|
||||
{
|
||||
perSystem =
|
||||
{
|
||||
config,
|
||||
lib,
|
||||
system,
|
||||
...
|
||||
}:
|
||||
{ config, lib, ... }:
|
||||
{
|
||||
devShells =
|
||||
let
|
||||
pkgs = import inputs.nixpkgs { inherit system; };
|
||||
stdenv = pkgs.stdenv;
|
||||
scripts = config.packages.python-scripts;
|
||||
in
|
||||
lib.pipe (config.packages) [
|
||||
(lib.concatMapAttrs (
|
||||
name: package: {
|
||||
${name} = pkgs.mkShell {
|
||||
name = "${name}";
|
||||
inputsFrom = [ package ];
|
||||
shellHook = ''
|
||||
echo "Entering ${name} devShell"
|
||||
'';
|
||||
};
|
||||
"${name}-extra" =
|
||||
if (name == "python-scripts") then
|
||||
null
|
||||
else
|
||||
pkgs.mkShell {
|
||||
name = "${name}-extra";
|
||||
inputsFrom = [
|
||||
package
|
||||
scripts
|
||||
];
|
||||
# Extra packages that *may* be used by some scripts
|
||||
packages = [
|
||||
pkgs.python3Packages.tiktoken
|
||||
];
|
||||
shellHook = ''
|
||||
echo "Entering ${name} devShell"
|
||||
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
|
||||
'';
|
||||
};
|
||||
}
|
||||
))
|
||||
(lib.filterAttrs (name: value: value != null))
|
||||
];
|
||||
lib.concatMapAttrs
|
||||
(name: package: {
|
||||
${name} = package.passthru.shell;
|
||||
${name + "-extra"} = package.passthru.shell-extra;
|
||||
})
|
||||
config.packages;
|
||||
};
|
||||
}
|
||||
|
|
|
@ -26,14 +26,16 @@
|
|||
config.cudaSupport = true;
|
||||
config.allowUnfreePredicate =
|
||||
p:
|
||||
builtins.all (
|
||||
builtins.all
|
||||
(
|
||||
license:
|
||||
license.free
|
||||
|| builtins.elem license.shortName [
|
||||
"CUDA EULA"
|
||||
"cuDNN EULA"
|
||||
]
|
||||
) (p.meta.licenses or [ p.meta.license ]);
|
||||
)
|
||||
(p.meta.licenses or [ p.meta.license ]);
|
||||
};
|
||||
# Ensure dependencies use ROCm consistently
|
||||
pkgsRocm = import inputs.nixpkgs {
|
||||
|
|
|
@ -1,36 +0,0 @@
|
|||
{
|
||||
lib,
|
||||
llamaVersion,
|
||||
numpy,
|
||||
tqdm,
|
||||
sentencepiece,
|
||||
pyyaml,
|
||||
poetry-core,
|
||||
buildPythonPackage,
|
||||
pytestCheckHook,
|
||||
}:
|
||||
|
||||
buildPythonPackage {
|
||||
pname = "gguf";
|
||||
version = llamaVersion;
|
||||
pyproject = true;
|
||||
nativeBuildInputs = [ poetry-core ];
|
||||
propagatedBuildInputs = [
|
||||
numpy
|
||||
tqdm
|
||||
sentencepiece
|
||||
pyyaml
|
||||
];
|
||||
src = lib.cleanSource ../../gguf-py;
|
||||
pythonImportsCheck = [
|
||||
"numpy"
|
||||
"gguf"
|
||||
];
|
||||
nativeCheckInputs = [ pytestCheckHook ];
|
||||
doCheck = true;
|
||||
meta = with lib; {
|
||||
description = "Python package for writing binary files in the GGUF format";
|
||||
license = licenses.mit;
|
||||
maintainers = [ maintainers.ditsuke ];
|
||||
};
|
||||
}
|
|
@ -3,36 +3,33 @@
|
|||
glibc,
|
||||
config,
|
||||
stdenv,
|
||||
mkShell,
|
||||
runCommand,
|
||||
cmake,
|
||||
ninja,
|
||||
pkg-config,
|
||||
git,
|
||||
python3,
|
||||
mpi,
|
||||
blas,
|
||||
cudaPackages,
|
||||
autoAddDriverRunpath,
|
||||
darwin,
|
||||
rocmPackages,
|
||||
vulkan-headers,
|
||||
vulkan-loader,
|
||||
curl,
|
||||
shaderc,
|
||||
useBlas ?
|
||||
builtins.all (x: !x) [
|
||||
clblast,
|
||||
useBlas ? builtins.all (x: !x) [
|
||||
useCuda
|
||||
useMetalKit
|
||||
useOpenCL
|
||||
useRocm
|
||||
useVulkan
|
||||
]
|
||||
&& blas.meta.available,
|
||||
] && blas.meta.available,
|
||||
useCuda ? config.cudaSupport,
|
||||
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
|
||||
# Increases the runtime closure size by ~700M
|
||||
useMpi ? false,
|
||||
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
|
||||
useMpi ? false, # Increases the runtime closure size by ~700M
|
||||
useOpenCL ? false,
|
||||
useRocm ? config.rocmSupport,
|
||||
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
|
||||
enableCurl ? true,
|
||||
useVulkan ? false,
|
||||
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
|
||||
|
||||
|
@ -40,8 +37,8 @@
|
|||
# otherwise we get libstdc++ errors downstream.
|
||||
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
|
||||
enableStatic ? effectiveStdenv.hostPlatform.isStatic,
|
||||
precompileMetalShaders ? false,
|
||||
}:
|
||||
precompileMetalShaders ? false
|
||||
}@inputs:
|
||||
|
||||
let
|
||||
inherit (lib)
|
||||
|
@ -49,6 +46,7 @@ let
|
|||
cmakeFeature
|
||||
optionals
|
||||
strings
|
||||
versionOlder
|
||||
;
|
||||
|
||||
stdenv = throw "Use effectiveStdenv instead";
|
||||
|
@ -58,15 +56,43 @@ let
|
|||
++ lib.optionals useCuda [ "CUDA" ]
|
||||
++ lib.optionals useMetalKit [ "MetalKit" ]
|
||||
++ lib.optionals useMpi [ "MPI" ]
|
||||
++ lib.optionals useOpenCL [ "OpenCL" ]
|
||||
++ lib.optionals useRocm [ "ROCm" ]
|
||||
++ lib.optionals useVulkan [ "Vulkan" ];
|
||||
|
||||
pnameSuffix =
|
||||
strings.optionalString (suffices != [ ])
|
||||
"-${strings.concatMapStringsSep "-" strings.toLower suffices}";
|
||||
descriptionSuffix = strings.optionalString (
|
||||
suffices != [ ]
|
||||
) ", accelerated with ${strings.concatStringsSep ", " suffices}";
|
||||
descriptionSuffix =
|
||||
strings.optionalString (suffices != [ ])
|
||||
", accelerated with ${strings.concatStringsSep ", " suffices}";
|
||||
|
||||
executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
|
||||
|
||||
# TODO: package the Python in this repository in a Nix-like way.
|
||||
# It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
|
||||
# is PEP 517-compatible, and ensure the correct .dist-info is generated.
|
||||
# https://peps.python.org/pep-0517/
|
||||
#
|
||||
# TODO: Package up each Python script or service appropriately, by making
|
||||
# them into "entrypoints"
|
||||
llama-python = python3.withPackages (
|
||||
ps: [
|
||||
ps.numpy
|
||||
ps.sentencepiece
|
||||
]
|
||||
);
|
||||
|
||||
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
|
||||
llama-python-extra = python3.withPackages (
|
||||
ps: [
|
||||
ps.numpy
|
||||
ps.sentencepiece
|
||||
ps.tiktoken
|
||||
ps.torchWithoutCuda
|
||||
ps.transformers
|
||||
]
|
||||
);
|
||||
|
||||
xcrunHost = runCommand "xcrunHost" {} ''
|
||||
mkdir -p $out/bin
|
||||
|
@ -85,9 +111,16 @@ let
|
|||
++ optionals useMetalKit [ MetalKit ];
|
||||
|
||||
cudaBuildInputs = with cudaPackages; [
|
||||
cuda_cudart
|
||||
cuda_cccl # <nv/target>
|
||||
libcublas
|
||||
cuda_cccl.dev # <nv/target>
|
||||
|
||||
# A temporary hack for reducing the closure size, remove once cudaPackages
|
||||
# have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
|
||||
cuda_cudart.dev
|
||||
cuda_cudart.lib
|
||||
cuda_cudart.static
|
||||
libcublas.dev
|
||||
libcublas.lib
|
||||
libcublas.static
|
||||
];
|
||||
|
||||
rocmBuildInputs = with rocmPackages; [
|
||||
|
@ -99,11 +132,11 @@ let
|
|||
vulkanBuildInputs = [
|
||||
vulkan-headers
|
||||
vulkan-loader
|
||||
shaderc
|
||||
];
|
||||
in
|
||||
|
||||
effectiveStdenv.mkDerivation (finalAttrs: {
|
||||
effectiveStdenv.mkDerivation (
|
||||
finalAttrs: {
|
||||
pname = "llama-cpp${pnameSuffix}";
|
||||
version = llamaVersion;
|
||||
|
||||
|
@ -127,9 +160,9 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
|||
};
|
||||
|
||||
postPatch = ''
|
||||
substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||
substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
|
||||
'';
|
||||
|
||||
|
@ -151,33 +184,38 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
|||
++ optionals useCuda [
|
||||
cudaPackages.cuda_nvcc
|
||||
|
||||
autoAddDriverRunpath
|
||||
# TODO: Replace with autoAddDriverRunpath
|
||||
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
|
||||
cudaPackages.autoAddOpenGLRunpathHook
|
||||
]
|
||||
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
|
||||
++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
|
||||
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
|
||||
glibc.static
|
||||
] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
|
||||
xcrunHost
|
||||
];
|
||||
|
||||
buildInputs =
|
||||
optionals effectiveStdenv.isDarwin darwinBuildInputs
|
||||
++ optionals useCuda cudaBuildInputs
|
||||
++ optionals useMpi [ mpi ]
|
||||
++ optionals useOpenCL [ clblast ]
|
||||
++ optionals useRocm rocmBuildInputs
|
||||
++ optionals useBlas [ blas ]
|
||||
++ optionals useVulkan vulkanBuildInputs
|
||||
++ optionals enableCurl [ curl ];
|
||||
++ optionals useVulkan vulkanBuildInputs;
|
||||
|
||||
cmakeFlags =
|
||||
[
|
||||
(cmakeBool "LLAMA_NATIVE" false)
|
||||
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
||||
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
|
||||
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
||||
(cmakeBool "LLAMA_CURL" enableCurl)
|
||||
(cmakeBool "GGML_NATIVE" false)
|
||||
(cmakeBool "GGML_BLAS" useBlas)
|
||||
(cmakeBool "GGML_CUDA" useCuda)
|
||||
(cmakeBool "GGML_HIP" useRocm)
|
||||
(cmakeBool "GGML_METAL" useMetalKit)
|
||||
(cmakeBool "GGML_VULKAN" useVulkan)
|
||||
(cmakeBool "GGML_STATIC" enableStatic)
|
||||
(cmakeBool "LLAMA_BLAS" useBlas)
|
||||
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
|
||||
(cmakeBool "LLAMA_CUDA" useCuda)
|
||||
(cmakeBool "LLAMA_HIPBLAS" useRocm)
|
||||
(cmakeBool "LLAMA_METAL" useMetalKit)
|
||||
(cmakeBool "LLAMA_VULKAN" useVulkan)
|
||||
(cmakeBool "LLAMA_STATIC" enableStatic)
|
||||
]
|
||||
++ optionals useCuda [
|
||||
(
|
||||
|
@ -189,11 +227,11 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
|||
]
|
||||
++ optionals useRocm [
|
||||
(cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
|
||||
(cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
|
||||
(cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
|
||||
]
|
||||
++ optionals useMetalKit [
|
||||
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
|
||||
(cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
|
||||
(cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
|
||||
];
|
||||
|
||||
# Environment variables needed for ROCm
|
||||
|
@ -205,15 +243,47 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
|||
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
||||
# if they haven't been added yet.
|
||||
postInstall = ''
|
||||
mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
|
||||
mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
|
||||
mkdir -p $out/include
|
||||
cp $src/include/llama.h $out/include/
|
||||
cp $src/llama.h $out/include/
|
||||
'';
|
||||
|
||||
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
|
||||
passthru = {
|
||||
inherit
|
||||
useBlas
|
||||
useCuda
|
||||
useMetalKit
|
||||
useMpi
|
||||
useOpenCL
|
||||
useRocm
|
||||
useVulkan
|
||||
;
|
||||
|
||||
shell = mkShell {
|
||||
name = "shell-${finalAttrs.finalPackage.name}";
|
||||
description = "contains numpy and sentencepiece";
|
||||
buildInputs = [ llama-python ];
|
||||
inputsFrom = [ finalAttrs.finalPackage ];
|
||||
shellHook = ''
|
||||
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
|
||||
'';
|
||||
};
|
||||
|
||||
shell-extra = mkShell {
|
||||
name = "shell-extra-${finalAttrs.finalPackage.name}";
|
||||
description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
|
||||
buildInputs = [ llama-python-extra ];
|
||||
inputsFrom = [ finalAttrs.finalPackage ];
|
||||
};
|
||||
};
|
||||
|
||||
meta = {
|
||||
# Configurations we don't want even the CI to evaluate. Results in the
|
||||
# "unsupported platform" messages. This is mostly a no-op, because
|
||||
# cudaPackages would've refused to evaluate anyway.
|
||||
badPlatforms = optionals useCuda lib.platforms.darwin;
|
||||
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
|
||||
|
||||
# Configurations that are known to result in build failures. Can be
|
||||
# overridden by importing Nixpkgs with `allowBroken = true`.
|
||||
|
@ -224,7 +294,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
|||
license = lib.licenses.mit;
|
||||
|
||||
# Accommodates `nix run` and `lib.getExe`
|
||||
mainProgram = "llama-cli";
|
||||
mainProgram = "llama";
|
||||
|
||||
# These people might respond, on the best effort basis, if you ping them
|
||||
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
|
||||
|
@ -244,4 +314,5 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
|||
# Extend `badPlatforms` instead
|
||||
platforms = lib.platforms.all;
|
||||
};
|
||||
})
|
||||
}
|
||||
)
|
||||
|
|
|
@ -1,66 +0,0 @@
|
|||
{
|
||||
lib,
|
||||
stdenv,
|
||||
buildPythonPackage,
|
||||
poetry-core,
|
||||
mkShell,
|
||||
python3Packages,
|
||||
gguf-py,
|
||||
}@inputs:
|
||||
|
||||
let
|
||||
llama-python-deps = with python3Packages; [
|
||||
numpy
|
||||
sentencepiece
|
||||
transformers
|
||||
protobuf
|
||||
torchWithoutCuda
|
||||
gguf-py
|
||||
tqdm
|
||||
|
||||
# for scripts/compare-llama-bench.py
|
||||
gitpython
|
||||
tabulate
|
||||
|
||||
# for examples/pydantic-models-to-grammar-examples.py
|
||||
docstring-parser
|
||||
pydantic
|
||||
|
||||
];
|
||||
|
||||
llama-python-test-deps = with python3Packages; [
|
||||
# Server bench
|
||||
matplotlib
|
||||
|
||||
# server tests
|
||||
openai
|
||||
pytest
|
||||
prometheus-client
|
||||
];
|
||||
in
|
||||
|
||||
buildPythonPackage ({
|
||||
pname = "llama-scripts";
|
||||
version = "0.0.0";
|
||||
pyproject = true;
|
||||
|
||||
# NOTE: The files filtered out here are not visible in the build sandbox, neither
|
||||
# do they affect the output hash. They can be modified without triggering a rebuild.
|
||||
src = lib.cleanSourceWith {
|
||||
filter =
|
||||
name: type:
|
||||
let
|
||||
any = builtins.any (x: x);
|
||||
baseName = builtins.baseNameOf name;
|
||||
in
|
||||
any [
|
||||
(lib.hasSuffix ".py" name)
|
||||
(baseName == "README.md")
|
||||
(baseName == "pyproject.toml")
|
||||
];
|
||||
src = lib.cleanSource ../../.;
|
||||
};
|
||||
nativeBuildInputs = [ poetry-core ];
|
||||
nativeCheckInputs = llama-python-test-deps;
|
||||
dependencies = llama-python-deps;
|
||||
})
|
|
@ -1,41 +1,19 @@
|
|||
{
|
||||
lib,
|
||||
newScope,
|
||||
python3,
|
||||
llamaVersion ? "0.0.0",
|
||||
}:
|
||||
|
||||
let
|
||||
pythonPackages = python3.pkgs;
|
||||
buildPythonPackage = pythonPackages.buildPythonPackage;
|
||||
numpy = pythonPackages.numpy;
|
||||
tqdm = pythonPackages.tqdm;
|
||||
sentencepiece = pythonPackages.sentencepiece;
|
||||
pyyaml = pythonPackages.pyyaml;
|
||||
poetry-core = pythonPackages.poetry-core;
|
||||
pytestCheckHook = pythonPackages.pytestCheckHook;
|
||||
in
|
||||
|
||||
# We're using `makeScope` instead of just writing out an attrset
|
||||
# because it allows users to apply overlays later using `overrideScope'`.
|
||||
# Cf. https://noogle.dev/f/lib/makeScope
|
||||
|
||||
lib.makeScope newScope (self: {
|
||||
lib.makeScope newScope (
|
||||
self: {
|
||||
inherit llamaVersion;
|
||||
gguf-py = self.callPackage ./package-gguf-py.nix {
|
||||
inherit
|
||||
buildPythonPackage
|
||||
numpy
|
||||
tqdm
|
||||
sentencepiece
|
||||
poetry-core
|
||||
pyyaml
|
||||
pytestCheckHook
|
||||
;
|
||||
};
|
||||
python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
|
||||
llama-cpp = self.callPackage ./package.nix { };
|
||||
docker = self.callPackage ./docker.nix { };
|
||||
docker-min = self.callPackage ./docker.nix { interactive = false; };
|
||||
sif = self.callPackage ./sif.nix { };
|
||||
})
|
||||
}
|
||||
)
|
||||
|
|
|
@ -1,113 +0,0 @@
|
|||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG ROCM_VERSION=6.3
|
||||
ARG AMDGPU_VERSION=6.3
|
||||
|
||||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
### Build image
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
# This is mostly tied to rocBLAS supported archs.
|
||||
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
|
||||
# gfx906 is deprecated
|
||||
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
|
||||
|
||||
#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
|
||||
ARG ROCM_DOCKER_ARCH=gfx1100
|
||||
|
||||
# Set nvcc architectured
|
||||
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
# Enable ROCm
|
||||
# ENV CC=/opt/rocm/llvm/bin/clang
|
||||
# ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
build-essential \
|
||||
cmake \
|
||||
git \
|
||||
libcurl4-openssl-dev \
|
||||
curl \
|
||||
libgomp1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
||||
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
|
||||
&& cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib \
|
||||
&& find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
&& cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
## Base image
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl\
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
|
||||
### Full
|
||||
FROM base AS full
|
||||
|
||||
COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
git \
|
||||
python3-pip \
|
||||
python3 \
|
||||
python3-wheel\
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Server, Server only
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
37
.devops/server-cuda.Dockerfile
Normal file
37
.devops/server-cuda.Dockerfile
Normal file
|
@ -0,0 +1,37 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=11.7.1
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
# Target the CUDA runtime image
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git libcurl4-openssl-dev
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable CUDA
|
||||
ENV LLAMA_CUDA=1
|
||||
# Enable cURL
|
||||
ENV LLAMA_CURL=1
|
||||
|
||||
RUN make -j$(nproc) server
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev libgomp1
|
||||
|
||||
COPY --from=build /app/server /server
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
45
.devops/server-intel.Dockerfile
Normal file
45
.devops/server-intel.Dockerfile
Normal file
|
@ -0,0 +1,45 @@
|
|||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
||||
rm /etc/apt/sources.list.d/intel-graphics.list && \
|
||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
||||
|
||||
ARG LLAMA_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git libcurl4-openssl-dev
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
||||
echo "LLAMA_SYCL_F16 is set" && \
|
||||
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release --target server
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
||||
rm /etc/apt/sources.list.d/intel-graphics.list && \
|
||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev
|
||||
|
||||
COPY --from=build /app/build/bin/server /server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
50
.devops/server-rocm.Dockerfile
Normal file
50
.devops/server-rocm.Dockerfile
Normal file
|
@ -0,0 +1,50 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG ROCM_VERSION=5.6
|
||||
|
||||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
# This is mostly tied to rocBLAS supported archs.
|
||||
ARG ROCM_DOCKER_ARCH=\
|
||||
gfx803 \
|
||||
gfx900 \
|
||||
gfx906 \
|
||||
gfx908 \
|
||||
gfx90a \
|
||||
gfx1010 \
|
||||
gfx1030 \
|
||||
gfx1100 \
|
||||
gfx1101 \
|
||||
gfx1102
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
# Enable ROCm
|
||||
ENV LLAMA_HIPBLAS=1
|
||||
ENV CC=/opt/rocm/llvm/bin/clang
|
||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||
|
||||
# Enable cURL
|
||||
ENV LLAMA_CURL=1
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev
|
||||
|
||||
RUN make -j$(nproc)
|
||||
|
||||
ENTRYPOINT [ "/app/server" ]
|
31
.devops/server-vulkan.Dockerfile
Normal file
31
.devops/server-vulkan.Dockerfile
Normal file
|
@ -0,0 +1,31 @@
|
|||
ARG UBUNTU_VERSION=jammy
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
|
||||
# Install build tools
|
||||
RUN apt update && apt install -y git build-essential cmake wget
|
||||
|
||||
# Install Vulkan SDK
|
||||
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
|
||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
|
||||
apt update -y && \
|
||||
apt-get install -y vulkan-sdk
|
||||
|
||||
# Install cURL
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev
|
||||
|
||||
# Build it
|
||||
WORKDIR /app
|
||||
COPY . .
|
||||
RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
|
||||
cmake --build build --config Release --target server
|
||||
|
||||
# Clean up
|
||||
WORKDIR /
|
||||
RUN cp /app/build/bin/server /server && \
|
||||
rm -rf /app
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
25
.devops/server.Dockerfile
Normal file
25
.devops/server.Dockerfile
Normal file
|
@ -0,0 +1,25 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git libcurl4-openssl-dev
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
ENV LLAMA_CURL=1
|
||||
|
||||
RUN make -j$(nproc) server
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev libgomp1
|
||||
|
||||
COPY --from=build /app/server /server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
|
@ -8,40 +8,36 @@ arg1="$1"
|
|||
shift
|
||||
|
||||
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
|
||||
exec python3 ./convert_hf_to_gguf.py "$@"
|
||||
python3 ./convert-hf-to-gguf.py "$@"
|
||||
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
|
||||
exec ./llama-quantize "$@"
|
||||
./quantize "$@"
|
||||
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
||||
exec ./llama-cli "$@"
|
||||
elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
|
||||
exec ./llama-bench "$@"
|
||||
elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
|
||||
exec ./llama-perplexity "$@"
|
||||
./main "$@"
|
||||
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
|
||||
./finetune "$@"
|
||||
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
||||
echo "Converting PTH to GGML..."
|
||||
for i in $(ls $1/$2/ggml-model-f16.bin*); do
|
||||
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
||||
if [ -f "${i/f16/q4_0}" ]; then
|
||||
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
|
||||
else
|
||||
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
|
||||
exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
|
||||
./quantize "$i" "${i/f16/q4_0}" q4_0
|
||||
fi
|
||||
done
|
||||
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
|
||||
exec ./llama-server "$@"
|
||||
./server "$@"
|
||||
else
|
||||
echo "Unknown command: $arg1"
|
||||
echo "Available commands: "
|
||||
echo " --run (-r): Run a model previously converted into ggml"
|
||||
echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
|
||||
echo " --bench (-b): Benchmark the performance of the inference for various parameters."
|
||||
echo " ex: -m model.gguf"
|
||||
echo " --perplexity (-p): Measure the perplexity of a model over a given text."
|
||||
echo " ex: -m model.gguf -f file.txt"
|
||||
echo " --convert (-c): Convert a llama model into ggml"
|
||||
echo " ex: --outtype f16 \"/models/7B/\" "
|
||||
echo " --quantize (-q): Optimize with quantization process ggml"
|
||||
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
|
||||
echo " --finetune (-f): Run finetune command to create a lora finetune of the model"
|
||||
echo " See documentation for finetune for command-line parameters"
|
||||
echo " --all-in-one (-a): Execute --convert & --quantize"
|
||||
echo " ex: \"/models/\" 7B"
|
||||
echo " --server (-s): Run a model on the server"
|
||||
|
|
|
@ -1,89 +0,0 @@
|
|||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
# Install build tools
|
||||
RUN apt update && apt install -y git build-essential cmake wget
|
||||
|
||||
# Install Vulkan SDK and cURL
|
||||
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
|
||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
|
||||
apt update -y && \
|
||||
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
|
||||
|
||||
# Build it
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
&& cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
## Base image
|
||||
FROM ubuntu:$UBUNTU_VERSION AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl libvulkan-dev \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
|
||||
### Full
|
||||
FROM base AS full
|
||||
|
||||
COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-wheel \
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Server, Server only
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
|
@ -1,7 +1,7 @@
|
|||
*.o
|
||||
*.a
|
||||
.cache/
|
||||
# Do not ignore .git directory, otherwise the reported build number will always be 0
|
||||
.git/
|
||||
.github/
|
||||
.gitignore
|
||||
.vs/
|
||||
|
@ -12,8 +12,8 @@ build*/
|
|||
|
||||
models/*
|
||||
|
||||
/llama-cli
|
||||
/llama-quantize
|
||||
/main
|
||||
/quantize
|
||||
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
|
|
2
.ecrc
2
.ecrc
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
|
||||
"Exclude": ["^\\.gitmodules$"],
|
||||
"Disable": {
|
||||
"IndentSize": true
|
||||
}
|
||||
|
|
|
@ -24,27 +24,5 @@ insert_final_newline = unset
|
|||
[examples/server/public/*]
|
||||
indent_size = 2
|
||||
|
||||
[examples/server/public/deps_*]
|
||||
trim_trailing_whitespace = unset
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
|
||||
[examples/server/deps_*]
|
||||
trim_trailing_whitespace = unset
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
|
||||
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
|
||||
indent_style = tab
|
||||
|
||||
[examples/cvector-generator/*.txt]
|
||||
trim_trailing_whitespace = unset
|
||||
insert_final_newline = unset
|
||||
|
||||
[models/templates/*.jinja]
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
end_of_line = unset
|
||||
charset = unset
|
||||
trim_trailing_whitespace = unset
|
||||
insert_final_newline = unset
|
||||
|
|
50
.github/ISSUE_TEMPLATE/01-bug-low.yml
vendored
Normal file
50
.github/ISSUE_TEMPLATE/01-bug-low.yml
vendored
Normal file
|
@ -0,0 +1,50 @@
|
|||
name: Low Severity Bugs
|
||||
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "low severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
87
.github/ISSUE_TEMPLATE/010-bug-compilation.yml
vendored
87
.github/ISSUE_TEMPLATE/010-bug-compilation.yml
vendored
|
@ -1,87 +0,0 @@
|
|||
name: Bug (compilation)
|
||||
description: Something goes wrong when trying to compile llama.cpp.
|
||||
title: "Compile bug: "
|
||||
labels: ["bug-unconfirmed", "compilation"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
This issue template is intended for bug reports where the compilation of llama.cpp fails.
|
||||
Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
|
||||
If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
|
||||
by clearing `~/.cache/ccache` (on Linux).
|
||||
- type: textarea
|
||||
id: commit
|
||||
attributes:
|
||||
label: Git commit
|
||||
description: Which commit are you trying to compile?
|
||||
placeholder: |
|
||||
$git rev-parse HEAD
|
||||
84a07a17b1b08cf2b9747c633a2372782848a27f
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: Operating systems
|
||||
description: Which operating systems do you know to be affected?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: backends
|
||||
attributes:
|
||||
label: GGML backends
|
||||
description: Which GGML backends do you know to be affected?
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
|
||||
multiple: true
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: info
|
||||
attributes:
|
||||
label: Problem description & steps to reproduce
|
||||
description: >
|
||||
Please give us a summary of the problem and tell us how to reproduce it.
|
||||
If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
|
||||
placeholder: >
|
||||
I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
|
||||
Here are the exact commands that I used: ...
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: first_bad_commit
|
||||
attributes:
|
||||
label: First Bad Commit
|
||||
description: >
|
||||
If the bug was not present on an earlier version: when did it start appearing?
|
||||
If possible, please do a git bisect and identify the exact commit that introduced the bug.
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: command
|
||||
attributes:
|
||||
label: Compile command
|
||||
description: >
|
||||
Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
Please copy and paste any relevant log output, including any generated text.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
required: true
|
101
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
101
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
|
@ -1,101 +0,0 @@
|
|||
name: Bug (model use)
|
||||
description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
|
||||
title: "Eval bug: "
|
||||
labels: ["bug-unconfirmed", "model evaluation"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
This issue template is intended for bug reports where the model evaluation results
|
||||
(i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
|
||||
If you encountered the issue while using an external UI (e.g. ollama),
|
||||
please reproduce your issue using one of the examples/binaries in this repository.
|
||||
The `llama-cli` binary can be used for simple and reproducible model inference.
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./llama-cli --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: Operating systems
|
||||
description: Which operating systems do you know to be affected?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: backends
|
||||
attributes:
|
||||
label: GGML backends
|
||||
description: Which GGML backends do you know to be affected?
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
|
||||
multiple: true
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: hardware
|
||||
attributes:
|
||||
label: Hardware
|
||||
description: Which CPUs/GPUs are you using?
|
||||
placeholder: >
|
||||
e.g. Ryzen 5950X + 2x RTX 4090
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: model
|
||||
attributes:
|
||||
label: Models
|
||||
description: >
|
||||
Which model(s) at which quantization were you using when encountering the bug?
|
||||
If you downloaded a GGUF file off of Huggingface, please provide a link.
|
||||
placeholder: >
|
||||
e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: info
|
||||
attributes:
|
||||
label: Problem description & steps to reproduce
|
||||
description: >
|
||||
Please give us a summary of the problem and tell us how to reproduce it.
|
||||
If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
|
||||
that information would be very much appreciated by us.
|
||||
placeholder: >
|
||||
e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
|
||||
When I use -ngl 0 it works correctly.
|
||||
Here are the exact commands that I used: ...
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: first_bad_commit
|
||||
attributes:
|
||||
label: First Bad Commit
|
||||
description: >
|
||||
If the bug was not present on an earlier version: when did it start appearing?
|
||||
If possible, please do a git bisect and identify the exact commit that introduced the bug.
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
Please copy and paste any relevant log output, including the command that you entered and any generated text.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
required: true
|
91
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
91
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
|
@ -1,91 +0,0 @@
|
|||
name: Bug (misc.)
|
||||
description: Something is not working the way it should (and it's not covered by any of the above cases).
|
||||
title: "Misc. bug: "
|
||||
labels: ["bug-unconfirmed"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
This issue template is intended for miscellaneous bugs that don't fit into any other category.
|
||||
If you encountered the issue while using an external UI (e.g. ollama),
|
||||
please reproduce your issue using one of the examples/binaries in this repository.
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which version of our software is affected? (You can use `--version` to get a version string.)
|
||||
placeholder: |
|
||||
$./llama-cli --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: Operating systems
|
||||
description: Which operating systems do you know to be affected?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: dropdown
|
||||
id: module
|
||||
attributes:
|
||||
label: Which llama.cpp modules do you know to be affected?
|
||||
multiple: true
|
||||
options:
|
||||
- Documentation/Github
|
||||
- libllama (core library)
|
||||
- llama-cli
|
||||
- llama-server
|
||||
- llama-bench
|
||||
- llama-quantize
|
||||
- Python/Bash scripts
|
||||
- Test code
|
||||
- Other (Please specify in the next section)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: command
|
||||
attributes:
|
||||
label: Command line
|
||||
description: >
|
||||
Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: info
|
||||
attributes:
|
||||
label: Problem description & steps to reproduce
|
||||
description: >
|
||||
Please give us a summary of the problem and tell us how to reproduce it (if applicable).
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: first_bad_commit
|
||||
attributes:
|
||||
label: First Bad Commit
|
||||
description: >
|
||||
If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
|
||||
If possible, please do a git bisect and identify the exact commit that introduced the bug.
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
If applicable, please copy and paste any relevant log output, including any generated text.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
required: false
|
50
.github/ISSUE_TEMPLATE/02-bug-medium.yml
vendored
Normal file
50
.github/ISSUE_TEMPLATE/02-bug-medium.yml
vendored
Normal file
|
@ -0,0 +1,50 @@
|
|||
name: Medium Severity Bug
|
||||
description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "medium severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
50
.github/ISSUE_TEMPLATE/03-bug-high.yml
vendored
Normal file
50
.github/ISSUE_TEMPLATE/03-bug-high.yml
vendored
Normal file
|
@ -0,0 +1,50 @@
|
|||
name: High Severity Bug
|
||||
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "high severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
50
.github/ISSUE_TEMPLATE/04-bug-critical.yml
vendored
Normal file
50
.github/ISSUE_TEMPLATE/04-bug-critical.yml
vendored
Normal file
|
@ -0,0 +1,50 @@
|
|||
name: Critical Severity Bug
|
||||
description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "critical severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
|
@ -1,5 +1,5 @@
|
|||
name: Enhancement
|
||||
description: Used to request enhancements for llama.cpp.
|
||||
description: Used to request enhancements for llama.cpp
|
||||
title: "Feature Request: "
|
||||
labels: ["enhancement"]
|
||||
body:
|
|
@ -1,5 +1,5 @@
|
|||
name: Research
|
||||
description: Track new technical research area.
|
||||
description: Track new technical research area
|
||||
title: "Research: "
|
||||
labels: ["research 🔬"]
|
||||
body:
|
|
@ -1,5 +1,5 @@
|
|||
name: Refactor (Maintainers)
|
||||
description: Used to track refactoring opportunities.
|
||||
description: Used to track refactoring opportunities
|
||||
title: "Refactor: "
|
||||
labels: ["refactor"]
|
||||
body:
|
2
.github/ISSUE_TEMPLATE/config.yml
vendored
2
.github/ISSUE_TEMPLATE/config.yml
vendored
|
@ -9,3 +9,5 @@ contact_links:
|
|||
- name: Want to contribute?
|
||||
url: https://github.com/ggerganov/llama.cpp/wiki/contribute
|
||||
about: Head to the contribution guide page of the wiki for areas you can help with
|
||||
|
||||
|
||||
|
|
5
.github/PULL_REQUEST_TEMPLATE/pull_request_template.md
vendored
Normal file
5
.github/PULL_REQUEST_TEMPLATE/pull_request_template.md
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
- Self Reported Review Complexity:
|
||||
- [ ] Review Complexity : Low
|
||||
- [ ] Review Complexity : Medium
|
||||
- [ ] Review Complexity : High
|
||||
- [ ] I have read the [contributing guidelines](CONTRIBUTING.md)
|
30
.github/labeler.yml
vendored
30
.github/labeler.yml
vendored
|
@ -2,32 +2,31 @@
|
|||
Kompute:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-kompute.h
|
||||
- ggml/src/ggml-kompute/**
|
||||
- ggml-kompute.h
|
||||
- ggml-kompute.cpp
|
||||
- README-kompute.md
|
||||
Apple Metal:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-metal.h
|
||||
- ggml/src/ggml-metal/**
|
||||
- ggml-metal.h
|
||||
- ggml-metal.cpp
|
||||
- README-metal.md
|
||||
SYCL:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-sycl.h
|
||||
- ggml/src/ggml-sycl/**
|
||||
- docs/backend/SYCL.md
|
||||
- examples/sycl/**
|
||||
- ggml-sycl.h
|
||||
- ggml-sycl.cpp
|
||||
- README-sycl.md
|
||||
Nvidia GPU:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-cuda.h
|
||||
- ggml/src/ggml-cuda/**
|
||||
- ggml-cuda.h
|
||||
- ggml-cuda/**
|
||||
Vulkan:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-vulkan.h
|
||||
- ggml/src/ggml-vulkan/**
|
||||
- ggml_vk_generate_shaders.py
|
||||
- ggml-vulkan*
|
||||
documentation:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
|
@ -43,6 +42,7 @@ build:
|
|||
- cmake/**
|
||||
- CMakeLists.txt
|
||||
- CMakePresets.json
|
||||
- codecov.yml
|
||||
examples:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file: examples/**
|
||||
|
@ -74,7 +74,11 @@ server:
|
|||
ggml:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/**
|
||||
- ggml.c
|
||||
- ggml.h
|
||||
- ggml-*.c
|
||||
- ggml-*.h
|
||||
- ggml-cuda/**
|
||||
nix:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
|
|
1
.github/pull_request_template.md
vendored
1
.github/pull_request_template.md
vendored
|
@ -1 +0,0 @@
|
|||
*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
|
|
@ -1,6 +1,3 @@
|
|||
# TODO: there have been some issues with the workflow, so disabling for now
|
||||
# https://github.com/ggerganov/llama.cpp/issues/7893
|
||||
#
|
||||
# Benchmark
|
||||
name: Benchmark
|
||||
|
||||
|
@ -27,10 +24,10 @@ on:
|
|||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
|
||||
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
|
||||
pull_request_target:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
|
||||
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
|
||||
schedule:
|
||||
- cron: '04 2 * * *'
|
||||
|
||||
|
@ -112,7 +109,7 @@ jobs:
|
|||
run: |
|
||||
set -eux
|
||||
cmake -B build \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DLLAMA_NATIVE=OFF \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DLLAMA_CURL=ON \
|
||||
-DLLAMA_CUBLAS=ON \
|
||||
|
@ -122,7 +119,7 @@ jobs:
|
|||
-DLLAMA_FATAL_WARNINGS=OFF \
|
||||
-DLLAMA_ALL_WARNINGS=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release;
|
||||
cmake --build build --config Release -j $(nproc) --target llama-server
|
||||
cmake --build build --config Release -j $(nproc) --target server
|
||||
|
||||
- name: Download the dataset
|
||||
id: download_dataset
|
||||
|
@ -132,8 +129,6 @@ jobs:
|
|||
|
||||
- name: Server bench
|
||||
id: server_bench
|
||||
env:
|
||||
HEAD_REF: ${{ github.head_ref || github.ref_name }}
|
||||
run: |
|
||||
set -eux
|
||||
|
||||
|
@ -142,7 +137,7 @@ jobs:
|
|||
python bench.py \
|
||||
--runner-label ${{ env.RUNNER_LABEL }} \
|
||||
--name ${{ github.job }} \
|
||||
--branch $HEAD_REF \
|
||||
--branch ${{ github.head_ref || github.ref_name }} \
|
||||
--commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
|
||||
--scenario script.js \
|
||||
--duration ${{ github.event.inputs.duration || env.DURATION }} \
|
877
.github/workflows/build.yml
vendored
877
.github/workflows/build.yml
vendored
File diff suppressed because it is too large
Load diff
7
.github/workflows/close-issue.yml
vendored
7
.github/workflows/close-issue.yml
vendored
|
@ -3,11 +3,6 @@ on:
|
|||
schedule:
|
||||
- cron: "42 0 * * *"
|
||||
|
||||
# Fine-grant permission
|
||||
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
||||
permissions:
|
||||
issues: write
|
||||
|
||||
jobs:
|
||||
close-issues:
|
||||
runs-on: ubuntu-latest
|
||||
|
@ -17,7 +12,7 @@ jobs:
|
|||
steps:
|
||||
- uses: actions/stale@v5
|
||||
with:
|
||||
exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
|
||||
exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
|
||||
days-before-issue-stale: 30
|
||||
days-before-issue-close: 14
|
||||
stale-issue-label: "stale"
|
||||
|
|
40
.github/workflows/code-coverage.yml
vendored
Normal file
40
.github/workflows/code-coverage.yml
vendored
Normal file
|
@ -0,0 +1,40 @@
|
|||
name: Code Coverage
|
||||
on: [push, pull_request]
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
run:
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential gcc-8 lcov
|
||||
|
||||
- name: Build
|
||||
run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
|
||||
|
||||
- name: Run tests
|
||||
run: CC=gcc-8 make test
|
||||
|
||||
- name: Generate coverage report
|
||||
run: |
|
||||
make coverage
|
||||
make lcov-report
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v3
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
with:
|
||||
files: lcov-report/coverage.info
|
164
.github/workflows/docker.yml
vendored
164
.github/workflows/docker.yml
vendored
|
@ -10,50 +10,49 @@
|
|||
name: Publish Docker image
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
schedule:
|
||||
# Rebuild daily rather than on every push because it is expensive
|
||||
- cron: '12 4 * * *'
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
# Fine-grant permission
|
||||
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
||||
permissions:
|
||||
packages: write
|
||||
|
||||
jobs:
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Hub
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
# Multi-stage build
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
|
||||
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
|
||||
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
|
||||
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
|
||||
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
|
||||
# have disabled them for now until the reason why
|
||||
# is understood.
|
||||
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0 # preserve git history, so we can determine the build number
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
uses: docker/setup-qemu-action@v2
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@v2
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v2
|
||||
|
@ -62,45 +61,9 @@ jobs:
|
|||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
|
||||
REPO_NAME="${{ github.event.repository.name }}"
|
||||
|
||||
# determine tag name postfix (build number, commit hash)
|
||||
if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
|
||||
TAG_POSTFIX="-b${BUILD_NUMBER}"
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
|
||||
TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
|
||||
fi
|
||||
# list all tags possible
|
||||
if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
|
||||
TYPE=""
|
||||
else
|
||||
TYPE="-${{ matrix.config.tag }}"
|
||||
fi
|
||||
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
|
||||
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
|
||||
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
|
||||
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
|
||||
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
|
||||
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
|
||||
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
|
||||
echo "full_output_tags=$FULLTAGS" # print out for debugging
|
||||
echo "light_output_tags=$LIGHTTAGS" # print out for debugging
|
||||
echo "server_output_tags=$SERVERTAGS" # print out for debugging
|
||||
env:
|
||||
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||
|
||||
# https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
|
||||
- name: Free Disk Space (Ubuntu)
|
||||
if: ${{ matrix.config.free_disk_space == true }}
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
uses: jlumbroso/free-disk-space@main
|
||||
with:
|
||||
# this might remove tools that are actually needed,
|
||||
# if set to "true" but frees about 6 GB
|
||||
|
@ -115,59 +78,40 @@ jobs:
|
|||
docker-images: true
|
||||
swap-storage: true
|
||||
|
||||
- name: Build and push Full Docker image (tagged + versioned)
|
||||
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
# tag list is generated from step above
|
||||
tags: ${{ steps.tag.outputs.full_output_tags }}
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
target: full
|
||||
provenance: false
|
||||
# using github experimental cache
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
# return to this if the experimental github cache is having issues
|
||||
#cache-to: type=local,dest=/tmp/.buildx-cache
|
||||
#cache-from: type=local,src=/tmp/.buildx-cache
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Build and push Light Docker image (tagged + versioned)
|
||||
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
# tag list is generated from step above
|
||||
tags: ${{ steps.tag.outputs.light_output_tags }}
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
target: light
|
||||
provenance: false
|
||||
# using github experimental cache
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
# return to this if the experimental github cache is having issues
|
||||
#cache-to: type=local,dest=/tmp/.buildx-cache
|
||||
#cache-from: type=local,src=/tmp/.buildx-cache
|
||||
- name: Downcase github.repository_owner
|
||||
run: |
|
||||
echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
|
||||
env:
|
||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||
|
||||
- name: Build and push Server Docker image (tagged + versioned)
|
||||
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
|
||||
uses: docker/build-push-action@v6
|
||||
- name: Build and push Docker image (versioned)
|
||||
if: github.event_name == 'push'
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
# tag list is generated from step above
|
||||
tags: ${{ steps.tag.outputs.server_output_tags }}
|
||||
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
- name: Build and push Docker image (tagged)
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
push: ${{ github.event_name == 'push' }}
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
target: server
|
||||
provenance: false
|
||||
# using github experimental cache
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
# return to this if the experimental github cache is having issues
|
||||
#cache-to: type=local,dest=/tmp/.buildx-cache
|
||||
#cache-from: type=local,src=/tmp/.buildx-cache
|
||||
|
|
4
.github/workflows/editorconfig.yml
vendored
4
.github/workflows/editorconfig.yml
vendored
|
@ -23,7 +23,5 @@ jobs:
|
|||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@v2
|
||||
with:
|
||||
version: v3.0.3
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@main
|
||||
- run: editorconfig-checker
|
||||
|
|
65
.github/workflows/nix-ci-aarch64.yml
vendored
Normal file
65
.github/workflows/nix-ci-aarch64.yml
vendored
Normal file
|
@ -0,0 +1,65 @@
|
|||
name: Nix aarch64 builds
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
schedule:
|
||||
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
|
||||
# 1.5h instead of minutes with the cold cache).
|
||||
#
|
||||
# randint(0, 59), randint(0, 23)
|
||||
- cron: '26 12 * * *'
|
||||
# But also rebuild if we touched any of the Nix expressions:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['**/*.nix', 'flake.lock']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/*.nix', 'flake.lock']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
nix-build-aarch64:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install QEMU
|
||||
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y qemu-user-static qemu-system-aarch64
|
||||
sudo usermod -a -G kvm $USER
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-platforms = aarch64-linux
|
||||
extra-system-features = nixos-test kvm
|
||||
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: Set-up cachix to push the results to
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: llama-cpp
|
||||
- name: Show all output paths
|
||||
run: >
|
||||
nix run github:nix-community/nix-eval-jobs
|
||||
-- --gc-roots-dir gcroot
|
||||
--flake
|
||||
".#packages.aarch64-linux"
|
||||
- name: Build
|
||||
run: >
|
||||
nix run github:Mic92/nix-fast-build
|
||||
-- --skip-cached --no-nom
|
||||
--systems aarch64-linux
|
||||
--flake
|
||||
".#checks.aarch64-linux"
|
72
.github/workflows/nix-ci.yml
vendored
Normal file
72
.github/workflows/nix-ci.yml
vendored
Normal file
|
@ -0,0 +1,72 @@
|
|||
name: Nix CI
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
nix-eval:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ ubuntu-latest, macos-latest ]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: List all flake outputs
|
||||
run: nix flake show --all-systems
|
||||
- name: Show all output paths
|
||||
run: >
|
||||
nix run github:nix-community/nix-eval-jobs
|
||||
-- --gc-roots-dir gcroot
|
||||
--flake
|
||||
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
||||
nix-build:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ ubuntu-latest, macos-latest ]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: Set-up cachix to push the results to
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: llama-cpp
|
||||
- name: Build
|
||||
run: >
|
||||
nix run github:Mic92/nix-fast-build
|
||||
-- --skip-cached --no-nom
|
||||
--flake
|
||||
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
22
.github/workflows/nix-flake-update.yml
vendored
Normal file
22
.github/workflows/nix-flake-update.yml
vendored
Normal file
|
@ -0,0 +1,22 @@
|
|||
name: update-flake-lock
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
|
||||
|
||||
jobs:
|
||||
lockfile:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@main
|
||||
- name: Update flake.lock
|
||||
uses: DeterminateSystems/update-flake-lock@main
|
||||
with:
|
||||
pr-title: "nix: update flake.lock"
|
||||
pr-labels: |
|
||||
nix
|
||||
pr-reviewers: philiptaron,SomeoneSerge
|
||||
token: ${{ secrets.FLAKE_TOKEN }}
|
36
.github/workflows/nix-publish-flake.yml
vendored
Normal file
36
.github/workflows/nix-publish-flake.yml
vendored
Normal file
|
@ -0,0 +1,36 @@
|
|||
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
|
||||
name: "Publish a flake to flakestry & flakehub"
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "*"
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
tag:
|
||||
description: "The existing tag to publish"
|
||||
type: "string"
|
||||
required: true
|
||||
jobs:
|
||||
flakestry-publish:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
id-token: "write"
|
||||
contents: "read"
|
||||
steps:
|
||||
- uses: flakestry/flakestry-publish@main
|
||||
with:
|
||||
version: "${{ inputs.tag || github.ref_name }}"
|
||||
flakehub-publish:
|
||||
runs-on: "ubuntu-latest"
|
||||
permissions:
|
||||
id-token: "write"
|
||||
contents: "read"
|
||||
steps:
|
||||
- uses: "actions/checkout@v4"
|
||||
with:
|
||||
ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
|
||||
- uses: "DeterminateSystems/nix-installer-action@main"
|
||||
- uses: "DeterminateSystems/flakehub-push@main"
|
||||
with:
|
||||
visibility: "public"
|
||||
tag: "${{ inputs.tag }}"
|
|
@ -6,13 +6,15 @@ on:
|
|||
- '.github/workflows/python-check-requirements.yml'
|
||||
- 'scripts/check-requirements.sh'
|
||||
- 'convert*.py'
|
||||
- '**/requirements*.txt'
|
||||
- 'requirements.txt'
|
||||
- 'requirements/*.txt'
|
||||
pull_request:
|
||||
paths:
|
||||
- '.github/workflows/python-check-requirements.yml'
|
||||
- 'scripts/check-requirements.sh'
|
||||
- 'convert*.py'
|
||||
- '**/requirements*.txt'
|
||||
- 'requirements.txt'
|
||||
- 'requirements/*.txt'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
|
|
9
.github/workflows/python-lint.yml
vendored
9
.github/workflows/python-lint.yml
vendored
|
@ -1,13 +1,6 @@
|
|||
name: flake8 Lint
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/python-lint.yml', '**/*.py']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/python-lint.yml', '**/*.py']
|
||||
on: [push, pull_request]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
|
|
40
.github/workflows/python-type-check.yml
vendored
40
.github/workflows/python-type-check.yml
vendored
|
@ -1,40 +0,0 @@
|
|||
name: Python Type-Check
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- '.github/workflows/python-type-check.yml'
|
||||
- 'pyrightconfig.json'
|
||||
- '**.py'
|
||||
- '**/requirements*.txt'
|
||||
pull_request:
|
||||
paths:
|
||||
- '.github/workflows/python-type-check.yml'
|
||||
- 'pyrightconfig.json'
|
||||
- '**.py'
|
||||
- '**/requirements*.txt'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
python-type-check:
|
||||
runs-on: ubuntu-latest
|
||||
name: pyright type-check
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Set up Python environment
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: Install Python dependencies
|
||||
# TODO: use a venv
|
||||
run: pip install -r requirements/requirements-all.txt
|
||||
- name: Type-check with Pyright
|
||||
uses: jakebailey/pyright-action@v2
|
||||
with:
|
||||
version: 1.1.382
|
||||
level: warning
|
||||
warnings: true
|
110
.github/workflows/server.yml
vendored
110
.github/workflows/server.yml
vendored
|
@ -20,12 +20,6 @@ on:
|
|||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
@ -36,7 +30,7 @@ jobs:
|
|||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [RelWithDebInfo]
|
||||
include:
|
||||
- build_type: Release
|
||||
|
@ -76,108 +70,46 @@ jobs:
|
|||
run: |
|
||||
pip install -r examples/server/tests/requirements.txt
|
||||
|
||||
# Setup nodejs (to be used for verifying bundled index.html)
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '22.11.0'
|
||||
|
||||
- name: WebUI - Install dependencies
|
||||
id: webui_lint
|
||||
run: |
|
||||
cd examples/server/webui
|
||||
npm ci
|
||||
|
||||
- name: WebUI - Check code format
|
||||
id: webui_format
|
||||
- name: Verify server deps
|
||||
id: verify_server_deps
|
||||
run: |
|
||||
git config --global --add safe.directory $(realpath .)
|
||||
cd examples/server/webui
|
||||
cd examples/server
|
||||
git ls-files --others --modified
|
||||
git status
|
||||
|
||||
npm run format
|
||||
./deps.sh
|
||||
git status
|
||||
modified_files="$(git status -s)"
|
||||
echo "Modified files: ${modified_files}"
|
||||
if [ -n "${modified_files}" ]; then
|
||||
echo "Files do not follow coding style. To fix: npm run format"
|
||||
echo "${modified_files}"
|
||||
not_ignored_files="$(git ls-files --others --modified)"
|
||||
echo "Modified files: ${not_ignored_files}"
|
||||
if [ -n "${not_ignored_files}" ]; then
|
||||
echo "Repository is dirty or server deps are not built as expected"
|
||||
echo "${not_ignored_files}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Verify bundled index.html
|
||||
id: verify_server_index_html
|
||||
run: |
|
||||
git config --global --add safe.directory $(realpath .)
|
||||
cd examples/server/webui
|
||||
git status
|
||||
|
||||
npm run build
|
||||
git status
|
||||
modified_files="$(git status -s)"
|
||||
echo "Modified files: ${modified_files}"
|
||||
if [ -n "${modified_files}" ]; then
|
||||
echo "Repository is dirty or server/webui is not built as expected"
|
||||
echo "Hint: You may need to follow Web UI build guide in server/README.md"
|
||||
echo "${modified_files}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Build (no OpenMP)
|
||||
id: cmake_build_no_openmp
|
||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DLLAMA_CURL=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_OPENMP=OFF ;
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Build (sanitizers)
|
||||
id: cmake_build_sanitizers
|
||||
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DLLAMA_NATIVE=OFF \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DLLAMA_CURL=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Build (sanitizers)
|
||||
id: cmake_build
|
||||
if: ${{ matrix.sanitizer == '' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DLLAMA_CURL=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ matrix.sanitizer == '' }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
./tests.sh
|
||||
|
||||
- name: Tests (sanitizers)
|
||||
id: server_integration_tests_sanitizers
|
||||
if: ${{ matrix.sanitizer != '' }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
LLAMA_SANITIZE=1 ./tests.sh
|
||||
PORT=8888 ./tests.sh
|
||||
|
||||
- name: Slow tests
|
||||
id: server_integration_tests_slow
|
||||
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
SLOW_TESTS=1 ./tests.sh
|
||||
PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
|
||||
|
||||
|
||||
server-windows:
|
||||
|
@ -204,7 +136,7 @@ jobs:
|
|||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
|
@ -227,13 +159,11 @@ jobs:
|
|||
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
$env:PYTHONIOENCODING = ":replace"
|
||||
pytest -v -x -m "not slow"
|
||||
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
|
||||
|
||||
- name: Slow tests
|
||||
id: server_integration_tests_slow
|
||||
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
$env:SLOW_TESTS = "1"
|
||||
pytest -v -x
|
||||
behave.exe --stop --no-skipped --no-capture --tags slow
|
||||
|
|
176
.gitignore
vendored
176
.gitignore
vendored
|
@ -1,145 +1,129 @@
|
|||
# Extensions
|
||||
|
||||
*.o
|
||||
*.a
|
||||
*.bat
|
||||
*.bin
|
||||
*.d
|
||||
*.dll
|
||||
*.dot
|
||||
*.etag
|
||||
*.exe
|
||||
*.gcda
|
||||
*.gcno
|
||||
*.gcov
|
||||
*.so
|
||||
*.gguf
|
||||
*.gguf.json
|
||||
*.lastModified
|
||||
*.bin
|
||||
*.exe
|
||||
*.dll
|
||||
*.log
|
||||
*.metallib
|
||||
*.o
|
||||
*.so
|
||||
*.swp
|
||||
*.gcov
|
||||
*.gcno
|
||||
*.gcda
|
||||
*.dot
|
||||
*.bat
|
||||
*.tmp
|
||||
|
||||
# IDE / OS
|
||||
|
||||
*.metallib
|
||||
*.etag
|
||||
*.lastModified
|
||||
.DS_Store
|
||||
.build/
|
||||
.cache/
|
||||
.ccls-cache/
|
||||
.direnv/
|
||||
.DS_Store
|
||||
.envrc
|
||||
.idea/
|
||||
.swiftpm
|
||||
.venv
|
||||
.clang-tidy
|
||||
.vs/
|
||||
.vscode/
|
||||
nppBackup
|
||||
.idea/
|
||||
|
||||
ggml-metal-embed.metal
|
||||
|
||||
# Coverage
|
||||
|
||||
gcovr-report/
|
||||
lcov-report/
|
||||
|
||||
# Build Artifacts
|
||||
gcovr-report/
|
||||
|
||||
tags
|
||||
.build/
|
||||
build*
|
||||
!build-info.cmake
|
||||
!build-info.cpp.in
|
||||
!build-info.sh
|
||||
!build.zig
|
||||
!docs/build.md
|
||||
/libllama.so
|
||||
/llama-*
|
||||
/vulkan-shaders-gen
|
||||
android-ndk-*
|
||||
arm_neon.h
|
||||
cmake-build-*
|
||||
CMakeSettings.json
|
||||
compile_commands.json
|
||||
ggml-metal-embed.metal
|
||||
llama-batched-swift
|
||||
/rpc-server
|
||||
android-ndk-*
|
||||
out/
|
||||
tmp/
|
||||
autogen-*.md
|
||||
|
||||
# Deprecated
|
||||
|
||||
/main
|
||||
/server
|
||||
|
||||
# CI
|
||||
|
||||
!.github/workflows/*.yml
|
||||
|
||||
# Models
|
||||
|
||||
models/*
|
||||
models-mnt
|
||||
!models/.editorconfig
|
||||
!models/ggml-vocab-*.gguf*
|
||||
|
||||
# Zig
|
||||
/Pipfile
|
||||
/baby-llama
|
||||
/beam-search
|
||||
/benchmark-matmult
|
||||
/convert-llama2c-to-ggml
|
||||
/embd-input-test
|
||||
/embedding
|
||||
/eval-callback
|
||||
/gguf
|
||||
/gguf-llama-simple
|
||||
/gguf-split
|
||||
/gritlm
|
||||
/imatrix
|
||||
/infill
|
||||
/libllama.so
|
||||
/llama-bench
|
||||
/llava-cli
|
||||
/lookahead
|
||||
/lookup
|
||||
/lookup-create
|
||||
/lookup-merge
|
||||
/lookup-stats
|
||||
/main
|
||||
/metal
|
||||
/passkey
|
||||
/perplexity
|
||||
/q8dot
|
||||
/quantize
|
||||
/quantize-stats
|
||||
/result
|
||||
/save-load-state
|
||||
/server
|
||||
/simple
|
||||
/batched
|
||||
/batched-bench
|
||||
/export-lora
|
||||
/finetune
|
||||
/retrieval
|
||||
/speculative
|
||||
/parallel
|
||||
/train-text-from-scratch
|
||||
/tokenize
|
||||
/vdot
|
||||
/common/build-info.cpp
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
CMakeSettings.json
|
||||
|
||||
__pycache__
|
||||
dist
|
||||
|
||||
zig-out/
|
||||
zig-cache/
|
||||
|
||||
# Logs
|
||||
|
||||
ppl-*.txt
|
||||
qnt-*.txt
|
||||
perf-*.txt
|
||||
|
||||
# Examples
|
||||
|
||||
examples/jeopardy/results.txt
|
||||
examples/server/*.css.hpp
|
||||
examples/server/*.html.hpp
|
||||
examples/server/*.js.hpp
|
||||
examples/server/*.mjs.hpp
|
||||
!build_64.sh
|
||||
!examples/*.bat
|
||||
!examples/*/*.kts
|
||||
!examples/*/*/*.kts
|
||||
!examples/sycl/*.bat
|
||||
!examples/sycl/*.sh
|
||||
examples/server/*.css.hpp
|
||||
|
||||
# Server Web UI temporary files
|
||||
node_modules
|
||||
examples/server/webui/dist
|
||||
|
||||
# Python
|
||||
|
||||
/.venv
|
||||
__pycache__/
|
||||
*/poetry.lock
|
||||
poetry.lock
|
||||
poetry.toml
|
||||
|
||||
# Nix
|
||||
/result
|
||||
nppBackup
|
||||
|
||||
# Test binaries
|
||||
/tests/test-backend-ops
|
||||
/tests/test-double-float
|
||||
/tests/test-grad0
|
||||
/tests/test-grammar-parser
|
||||
/tests/test-llama-grammar
|
||||
/tests/test-double-float
|
||||
/tests/test-grad0
|
||||
/tests/test-opt
|
||||
/tests/test-quantize-fns
|
||||
/tests/test-quantize-perf
|
||||
/tests/test-rope
|
||||
/tests/test-sampling
|
||||
/tests/test-tokenizer-0
|
||||
/tests/test-tokenizer-1-bpe
|
||||
/tests/test-tokenizer-1-spm
|
||||
|
||||
# Scripts
|
||||
!/scripts/install-oneapi.bat
|
||||
|
||||
# Test models for lora adapters
|
||||
/lora-tests
|
||||
|
||||
# Local scripts
|
||||
/run-vim.sh
|
||||
/run-chat.sh
|
||||
/tests/test-tokenizer-1-bpe
|
||||
/tests/test-rope
|
||||
/tests/test-backend-ops
|
||||
|
|
2
.gitmodules
vendored
2
.gitmodules
vendored
|
@ -1,3 +1,3 @@
|
|||
[submodule "kompute"]
|
||||
path = ggml/src/ggml-kompute/kompute
|
||||
path = kompute
|
||||
url = https://github.com/nomic-ai/kompute.git
|
||||
|
|
1372
CMakeLists.txt
1372
CMakeLists.txt
File diff suppressed because it is too large
Load diff
|
@ -11,37 +11,15 @@
|
|||
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "sycl-base",
|
||||
"hidden": true,
|
||||
"generator": "Ninja",
|
||||
"binaryDir": "${sourceDir}/build-${presetName}",
|
||||
"cacheVariables": {
|
||||
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
|
||||
"CMAKE_CXX_COMPILER": "icx",
|
||||
"CMAKE_C_COMPILER": "cl",
|
||||
"GGML_SYCL": "ON",
|
||||
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||
}
|
||||
},
|
||||
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
|
||||
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
|
||||
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
||||
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
|
||||
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
|
||||
{ "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },
|
||||
|
||||
{
|
||||
"name": "x64-windows-llvm", "hidden": true,
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
|
||||
}
|
||||
},
|
||||
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
|
||||
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
||||
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
|
||||
|
||||
{
|
||||
"name": "arm64-windows-msvc", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
|
||||
}
|
||||
|
@ -50,48 +28,22 @@
|
|||
{
|
||||
"name": "arm64-windows-llvm", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-apple-clang", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
|
||||
}
|
||||
},
|
||||
|
||||
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
||||
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
|
||||
{ "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
|
||||
{ "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
|
||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
|
||||
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
|
||||
|
||||
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
|
||||
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
|
||||
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
|
||||
{ "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
|
||||
|
||||
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
|
||||
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
|
||||
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
|
||||
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
|
||||
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
|
||||
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
|
||||
|
||||
{ "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
|
||||
{ "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
|
||||
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
|
||||
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
|
||||
]
|
||||
}
|
||||
|
|
11
CODEOWNERS
11
CODEOWNERS
|
@ -1,11 +0,0 @@
|
|||
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
|
||||
|
||||
/ci/ @ggerganov
|
||||
/.devops/*.Dockerfile @ngxson
|
||||
/examples/server/ @ngxson
|
||||
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
|
||||
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
||||
/ggml/src/gguf.cpp @JohannesGaessler
|
131
CONTRIBUTING.md
131
CONTRIBUTING.md
|
@ -1,125 +1,14 @@
|
|||
# Pull requests (for contributors)
|
||||
# Contributing Guidelines
|
||||
|
||||
- Test your changes:
|
||||
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
||||
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
|
||||
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
|
||||
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
|
||||
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
|
||||
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
|
||||
## Checklist
|
||||
|
||||
# Pull requests (for collaborators)
|
||||
* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
|
||||
* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
|
||||
* Execute [the full CI locally on your machine](ci/README.md) before publishing
|
||||
|
||||
- Squash-merge PRs
|
||||
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
|
||||
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
|
||||
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
|
||||
## PR formatting
|
||||
|
||||
# Coding guidelines
|
||||
|
||||
- Avoid adding third-party dependencies, extra files, extra headers, etc.
|
||||
- Always consider cross-compatibility with other operating systems and architectures
|
||||
- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
|
||||
- Vertical alignment makes things more readable and easier to batch edit
|
||||
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
|
||||
- Use sized integer types such as `int32_t` in the public API, e.g. `size_t` may also be appropriate for allocation sizes or byte offsets
|
||||
- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo`
|
||||
- In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary
|
||||
```cpp
|
||||
// OK
|
||||
llama_context * ctx;
|
||||
const llama_rope_type rope_type;
|
||||
|
||||
// not OK
|
||||
struct llama_context * ctx;
|
||||
const enum llama_rope_type rope_type;
|
||||
```
|
||||
|
||||
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
|
||||
|
||||
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
|
||||
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
|
||||
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
|
||||
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
|
||||
|
||||

|
||||
|
||||
# Naming guidelines
|
||||
|
||||
- Use `snake_case` for function, variable and type names
|
||||
- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
|
||||
|
||||
```cpp
|
||||
// not OK
|
||||
int small_number;
|
||||
int big_number;
|
||||
|
||||
// OK
|
||||
int number_small;
|
||||
int number_big;
|
||||
```
|
||||
|
||||
- Enum values are always in upper case and prefixed with the enum name
|
||||
|
||||
```cpp
|
||||
enum llama_vocab_type {
|
||||
LLAMA_VOCAB_TYPE_NONE = 0,
|
||||
LLAMA_VOCAB_TYPE_SPM = 1,
|
||||
LLAMA_VOCAB_TYPE_BPE = 2,
|
||||
LLAMA_VOCAB_TYPE_WPM = 3,
|
||||
LLAMA_VOCAB_TYPE_UGM = 4,
|
||||
LLAMA_VOCAB_TYPE_RWKV = 5,
|
||||
};
|
||||
```
|
||||
|
||||
- The general naming pattern is `<class>_<method>`, with `<method>` being `<action>_<noun>`
|
||||
|
||||
```cpp
|
||||
llama_model_init(); // class: "llama_model", method: "init"
|
||||
llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove"
|
||||
llama_sampler_get_seed(); // class: "llama_sampler", method: "get_seed"
|
||||
llama_set_embeddings(); // class: "llama_context", method: "set_embeddings"
|
||||
llama_n_threads(); // class: "llama_context", method: "n_threads"
|
||||
llama_adapter_lora_free(); // class: "llama_adapter_lora", method: "free"
|
||||
```
|
||||
|
||||
- The `get` `<action>` can be omitted
|
||||
- The `<noun>` can be omitted if not necessary
|
||||
- The `_context` suffix of the `<class>` is optional. Use it to disambiguate symbols when needed
|
||||
- Use `init`/`free` for constructor/destructor `<action>`
|
||||
|
||||
- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else
|
||||
|
||||
```cpp
|
||||
typedef struct llama_context * llama_context_t;
|
||||
|
||||
enum llama_pooling_type llama_pooling_type(const llama_context_t ctx);
|
||||
```
|
||||
|
||||
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_
|
||||
|
||||
- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension
|
||||
- Python filenames are all lowercase with underscores
|
||||
|
||||
- _(TODO: abbreviations usage)_
|
||||
|
||||
# Preprocessor directives
|
||||
|
||||
- _(TODO: add guidelines with examples and apply them to the codebase)_
|
||||
|
||||
```cpp
|
||||
#ifdef FOO
|
||||
#endif // FOO
|
||||
```
|
||||
|
||||
# Documentation
|
||||
|
||||
- Documentation is a community effort
|
||||
- When you need to look into the source code to figure out how to use an API consider adding a short summary to the header file for future reference
|
||||
- When you notice incorrect or outdated documentation, please update it
|
||||
|
||||
# Resources
|
||||
|
||||
The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
|
||||
|
||||
https://github.com/ggerganov/llama.cpp/projects
|
||||
* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
|
||||
- The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your conveience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
|
||||
* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
|
||||
* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`
|
||||
|
|
|
@ -2,6 +2,45 @@
|
|||
|
||||
import PackageDescription
|
||||
|
||||
var sources = [
|
||||
"ggml.c",
|
||||
"sgemm.cpp",
|
||||
"llama.cpp",
|
||||
"unicode.cpp",
|
||||
"unicode-data.cpp",
|
||||
"ggml-alloc.c",
|
||||
"ggml-backend.c",
|
||||
"ggml-quants.c",
|
||||
]
|
||||
|
||||
var resources: [Resource] = []
|
||||
var linkerSettings: [LinkerSetting] = []
|
||||
var cSettings: [CSetting] = [
|
||||
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
|
||||
.unsafeFlags(["-fno-objc-arc"]),
|
||||
// NOTE: NEW_LAPACK will required iOS version 16.4+
|
||||
// We should consider add this in the future when we drop support for iOS 14
|
||||
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
|
||||
// .define("ACCELERATE_NEW_LAPACK"),
|
||||
// .define("ACCELERATE_LAPACK_ILP64")
|
||||
]
|
||||
|
||||
#if canImport(Darwin)
|
||||
sources.append("ggml-metal.m")
|
||||
resources.append(.process("ggml-metal.metal"))
|
||||
linkerSettings.append(.linkedFramework("Accelerate"))
|
||||
cSettings.append(
|
||||
contentsOf: [
|
||||
.define("GGML_USE_ACCELERATE"),
|
||||
.define("GGML_USE_METAL")
|
||||
]
|
||||
)
|
||||
#endif
|
||||
|
||||
#if os(Linux)
|
||||
cSettings.append(.define("_GNU_SOURCE"))
|
||||
#endif
|
||||
|
||||
let package = Package(
|
||||
name: "llama",
|
||||
platforms: [
|
||||
|
@ -14,6 +53,26 @@ let package = Package(
|
|||
.library(name: "llama", targets: ["llama"]),
|
||||
],
|
||||
targets: [
|
||||
.systemLibrary(name: "llama", pkgConfig: "llama"),
|
||||
]
|
||||
.target(
|
||||
name: "llama",
|
||||
path: ".",
|
||||
exclude: [
|
||||
"cmake",
|
||||
"examples",
|
||||
"scripts",
|
||||
"models",
|
||||
"tests",
|
||||
"CMakeLists.txt",
|
||||
"ggml-cuda.cu",
|
||||
"ggml-cuda.h",
|
||||
"Makefile"
|
||||
],
|
||||
sources: sources,
|
||||
resources: resources,
|
||||
publicHeadersPath: "spm-headers",
|
||||
cSettings: cSettings,
|
||||
linkerSettings: linkerSettings
|
||||
)
|
||||
],
|
||||
cxxLanguageStandard: .cxx11
|
||||
)
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# llama.cpp for SYCL
|
||||
|
||||
- [Background](#background)
|
||||
- [Recommended Release](#recommended-release)
|
||||
- [News](#news)
|
||||
- [OS](#os)
|
||||
- [Hardware](#hardware)
|
||||
|
@ -20,38 +19,20 @@
|
|||
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
|
||||
|
||||
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
|
||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
|
||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
|
||||
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
|
||||
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
||||
|
||||
### Llama.cpp + SYCL
|
||||
|
||||
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD.
|
||||
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
|
||||
|
||||
## Recommended Release
|
||||
|
||||
The SYCL backend would be broken by some PRs due to no online CI.
|
||||
|
||||
The following release is verified with good quality:
|
||||
|
||||
|Commit ID|Tag|Release|Verified Platform| Update date|
|
||||
|-|-|-|-|-|
|
||||
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|
||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
|
||||
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
|
||||
|
||||
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
|
||||
|
||||
## News
|
||||
|
||||
- 2024.11
|
||||
- Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer.
|
||||
|
||||
- 2024.8
|
||||
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
|
||||
|
||||
- 2024.5
|
||||
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
|
||||
- Arch Linux is verified successfully.
|
||||
|
||||
- 2024.4
|
||||
- Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.
|
||||
|
||||
|
@ -83,14 +64,7 @@ The following release is verified with good quality:
|
|||
|
||||
### Intel GPU
|
||||
|
||||
SYCL backend supports Intel GPU Family:
|
||||
|
||||
- Intel Data Center Max Series
|
||||
- Intel Flex Series, Arc Series
|
||||
- Intel Built-in Arc GPU
|
||||
- Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)).
|
||||
|
||||
#### Verified devices
|
||||
**Verified devices**
|
||||
|
||||
| Intel GPU | Status | Verified Model |
|
||||
|-------------------------------|---------|---------------------------------------|
|
||||
|
@ -98,12 +72,12 @@ SYCL backend supports Intel GPU Family:
|
|||
| Intel Data Center Flex Series | Support | Flex 170 |
|
||||
| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
|
||||
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
|
||||
| Intel iGPU | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 |
|
||||
| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
|
||||
|
||||
*Notes:*
|
||||
|
||||
- **Memory**
|
||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
|
||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
|
||||
|
||||
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
|
||||
|
||||
|
@ -115,17 +89,9 @@ SYCL backend supports Intel GPU Family:
|
|||
**Verified devices**
|
||||
|
||||
| Nvidia GPU | Status | Verified Model |
|
||||
|--------------------------|-----------|----------------|
|
||||
| Ampere Series | Supported | A100, A4000 |
|
||||
| Ampere Series *(Mobile)* | Supported | RTX 40 Series |
|
||||
|
||||
| AMD GPU | Status | Verified Model |
|
||||
|--------------------------|--------------|----------------|
|
||||
| Radeon Pro | Experimental | W6800 |
|
||||
| Radeon RX | Experimental | 6700 XT |
|
||||
|
||||
Note: AMD GPU support is highly experimental and is incompatible with F16.
|
||||
Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.
|
||||
|--------------------------|---------|----------------|
|
||||
| Ampere Series | Support | A100, A4000 |
|
||||
| Ampere Series *(Mobile)* | Support | RTX 40 Series |
|
||||
|
||||
## Docker
|
||||
The docker build option is currently limited to *intel GPU* targets.
|
||||
|
@ -133,14 +99,14 @@ The docker build option is currently limited to *intel GPU* targets.
|
|||
### Build image
|
||||
```sh
|
||||
# Using FP16
|
||||
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
|
||||
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
|
||||
```
|
||||
|
||||
*Notes*:
|
||||
|
||||
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
|
||||
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
|
||||
|
||||
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
||||
You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
|
||||
|
||||
### Run container
|
||||
|
||||
|
@ -197,10 +163,6 @@ Platform #0: Intel(R) OpenCL HD Graphics
|
|||
|
||||
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
|
||||
|
||||
- **AMD GPU**
|
||||
|
||||
To target AMD GPUs with SYCL, the ROCm stack must be installed first.
|
||||
|
||||
2. **Install Intel® oneAPI Base toolkit**
|
||||
|
||||
- **For Intel GPU**
|
||||
|
@ -211,7 +173,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li
|
|||
|
||||
Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
|
||||
|
||||
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.
|
||||
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
|
||||
|
||||
- **Adding support to Nvidia GPUs**
|
||||
|
||||
|
@ -227,19 +189,6 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB
|
|||
cmake --build buildWithCublas --config Release
|
||||
```
|
||||
|
||||
- **Adding support to AMD GPUs**
|
||||
|
||||
**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
|
||||
|
||||
**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
|
||||
|
||||
```sh
|
||||
git clone https://github.com/oneapi-src/oneMKL
|
||||
cd oneMKL
|
||||
# Find your HIPTARGET with rocminfo, under the key 'Name:'
|
||||
cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
|
||||
cmake --build buildWithrocBLAS --config Release
|
||||
```
|
||||
|
||||
3. **Verify installation and environment**
|
||||
|
||||
|
@ -251,60 +200,44 @@ sycl-ls
|
|||
|
||||
- **Intel GPU**
|
||||
|
||||
When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below:
|
||||
When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below:
|
||||
|
||||
```
|
||||
[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu][opencl:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||
[level_zero:gpu][level_zero:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||
```
|
||||
|
||||
- **Nvidia GPU**
|
||||
|
||||
Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`cuda:gpu`] as below:
|
||||
|
||||
Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
|
||||
```
|
||||
[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
|
||||
[opencl:cpu][opencl:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
|
||||
[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.5]
|
||||
```
|
||||
|
||||
- **AMD GPU**
|
||||
|
||||
For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
|
||||
|
||||
```
|
||||
[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i9-12900K OpenCL 3.0 (Build 0) [2024.18.6.0.02_160000]
|
||||
[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
|
||||
[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
|
||||
```
|
||||
|
||||
### II. Build llama.cpp
|
||||
|
||||
#### Intel GPU
|
||||
|
||||
```
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```sh
|
||||
# Export relevant ENV variables
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
# Build LLAMA with MKL BLAS acceleration for intel GPU
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
# Option 2: Use FP16
|
||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
|
||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
||||
|
||||
# build all binary
|
||||
cmake --build build --config Release -j -v
|
||||
```
|
||||
|
||||
#### Nvidia GPU
|
||||
|
||||
```sh
|
||||
# Export relevant ENV variables
|
||||
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
|
||||
|
@ -313,106 +246,62 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_
|
|||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
|
||||
|
||||
# Build LLAMA with Nvidia BLAS acceleration through SYCL
|
||||
# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
|
||||
GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
# Option 2: Use FP16
|
||||
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
|
||||
cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
||||
|
||||
# build all binary
|
||||
cmake --build build --config Release -j -v
|
||||
```
|
||||
|
||||
#### AMD GPU
|
||||
|
||||
```sh
|
||||
# Export relevant ENV variables
|
||||
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
|
||||
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
|
||||
|
||||
# Build LLAMA with rocBLAS acceleration through SYCL
|
||||
|
||||
## AMD
|
||||
# Use FP32, FP16 is not supported
|
||||
# Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:'
|
||||
GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture
|
||||
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
# build all binary
|
||||
cmake --build build --config Release -j -v
|
||||
```
|
||||
|
||||
### III. Run the inference
|
||||
|
||||
#### Retrieve and prepare model
|
||||
1. Retrieve and prepare model
|
||||
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
|
||||
|
||||
##### Check device
|
||||
|
||||
1. Enable oneAPI running environment
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```sh
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
2. List devices information
|
||||
3. List devices information
|
||||
|
||||
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
|
||||
|
||||
```sh
|
||||
./build/bin/llama-ls-sycl-device
|
||||
./build/bin/ls-sycl-device
|
||||
```
|
||||
|
||||
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
|
||||
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
|
||||
```
|
||||
found 2 SYCL devices:
|
||||
|
||||
found 6 SYCL devices:
|
||||
| | | |Compute |Max compute|Max work|Max sub| |
|
||||
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|
||||
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
|
||||
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
|
||||
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
|
||||
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
|
||||
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
|
||||
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
|
||||
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
|
||||
```
|
||||
|
||||
#### Choose level-zero devices
|
||||
| Attribute | Note |
|
||||
|------------------------|-------------------------------------------------------------|
|
||||
| compute capability 1.3 | Level-zero driver/runtime, recommended |
|
||||
| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
|
||||
|
||||
|Chosen Device ID|Setting|
|
||||
|-|-|
|
||||
|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action|
|
||||
|1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|
||||
|0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
|
||||
|
||||
#### Execute
|
||||
|
||||
Choose one of following methods to run.
|
||||
|
||||
1. Script
|
||||
|
||||
- Use device 0:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run-llama2.sh 0
|
||||
```
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run-llama2.sh
|
||||
```
|
||||
|
||||
2. Command line
|
||||
Launch inference
|
||||
4. Launch inference
|
||||
|
||||
There are two device selection modes:
|
||||
|
||||
- Single device: Use one device assigned by user. Default device id is 0.
|
||||
- Multiple devices: Automatically choose the devices with the same backend.
|
||||
|
||||
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
|
||||
- Single device: Use one device target specified by the user.
|
||||
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
|
||||
|
||||
| Device selection | Parameter |
|
||||
|------------------|----------------------------------------|
|
||||
|
@ -424,13 +313,24 @@ Examples:
|
|||
- Use device 0:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run_llama2.sh 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
```
|
||||
|
||||
Otherwise, you can run the script:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run_llama2.sh
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
|
@ -479,7 +379,7 @@ c. Verify installation
|
|||
In the oneAPI command line, run the following to print the available SYCL devices:
|
||||
|
||||
```
|
||||
sycl-ls.exe
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device:
|
||||
|
@ -494,120 +394,89 @@ Output (example):
|
|||
|
||||
4. Install build tools
|
||||
|
||||
a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
|
||||
b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
|
||||
a. Download & install cmake for Windows: https://cmake.org/download/
|
||||
|
||||
b. Download & install mingw-w64 make for Windows provided by w64devkit
|
||||
|
||||
- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
|
||||
|
||||
- Extract `w64devkit` on your pc.
|
||||
|
||||
- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).
|
||||
|
||||
### II. Build llama.cpp
|
||||
|
||||
You could download the release package for Windows directly, which including binary files and depended oneAPI dll files.
|
||||
|
||||
Choose one of following methods to build from source code.
|
||||
|
||||
1. Script
|
||||
|
||||
```sh
|
||||
.\examples\sycl\win-build-sycl.bat
|
||||
```
|
||||
|
||||
2. CMake
|
||||
|
||||
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
|
||||
|
||||
```
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
# Option 2: Or FP16
|
||||
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
|
||||
cmake --build build --config Release -j
|
||||
```
|
||||
|
||||
Or, use CMake presets to build:
|
||||
|
||||
Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
|
||||
```sh
|
||||
cmake --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||
|
||||
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||
|
||||
cmake --preset x64-windows-sycl-debug
|
||||
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
|
||||
.\examples\sycl\win-build-sycl.bat
|
||||
```
|
||||
|
||||
3. Visual Studio
|
||||
|
||||
You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
|
||||
|
||||
*Notes:*
|
||||
|
||||
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
|
||||
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
|
||||
|
||||
### III. Run the inference
|
||||
|
||||
#### Retrieve and prepare model
|
||||
1. Retrieve and prepare model
|
||||
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
|
||||
You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
|
||||
|
||||
##### Check device
|
||||
|
||||
1. Enable oneAPI running environment
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
On the oneAPI command line window, run the following and step into the llama.cpp directory:
|
||||
```
|
||||
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
|
||||
```
|
||||
|
||||
2. List devices information
|
||||
3. List devices information
|
||||
|
||||
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
|
||||
|
||||
```
|
||||
build\bin\llama-ls-sycl-device.exe
|
||||
build\bin\ls-sycl-device.exe
|
||||
```
|
||||
|
||||
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
|
||||
The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
|
||||
```
|
||||
found 2 SYCL devices:
|
||||
found 6 SYCL devices:
|
||||
| | | |Compute |Max compute|Max work|Max sub| |
|
||||
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|
||||
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
|
||||
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
|
||||
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
|
||||
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
|
||||
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
|
||||
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
|
||||
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
|
||||
|
||||
```
|
||||
#### Choose level-zero devices
|
||||
|
||||
|Chosen Device ID|Setting|
|
||||
|-|-|
|
||||
|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
|
||||
|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|
||||
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
|
||||
| Attribute | Note |
|
||||
|------------------------|-----------------------------------------------------------|
|
||||
| compute capability 1.3 | Level-zero running time, recommended |
|
||||
| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
|
||||
|
||||
#### Execute
|
||||
|
||||
Choose one of following methods to run.
|
||||
|
||||
1. Script
|
||||
|
||||
```
|
||||
examples\sycl\win-run-llama2.bat
|
||||
```
|
||||
|
||||
2. Command line
|
||||
|
||||
Launch inference
|
||||
4. Launch inference
|
||||
|
||||
There are two device selection modes:
|
||||
|
||||
- Single device: Use one device assigned by user. Default device id is 0.
|
||||
- Multiple devices: Automatically choose the devices with the same backend.
|
||||
|
||||
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
|
||||
- Single device: Use one device assigned by user.
|
||||
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
|
||||
|
||||
| Device selection | Parameter |
|
||||
|------------------|----------------------------------------|
|
||||
|
@ -619,15 +488,19 @@ Examples:
|
|||
- Use device 0:
|
||||
|
||||
```
|
||||
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```
|
||||
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||
```
|
||||
Otherwise, run the following wrapper script:
|
||||
|
||||
```
|
||||
.\examples\sycl\win-run-llama2.bat
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
|
@ -641,19 +514,17 @@ Or
|
|||
use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
```
|
||||
|
||||
|
||||
## Environment Variable
|
||||
|
||||
#### Build
|
||||
|
||||
| Name | Value | Function |
|
||||
|--------------------|---------------------------------------|---------------------------------------------|
|
||||
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
|
||||
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
|
||||
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
|
||||
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
|
||||
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
|
||||
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
||||
|--------------------|-----------------------------------|---------------------------------------------|
|
||||
| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. |
|
||||
| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
|
||||
| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
|
||||
| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
|
||||
| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
||||
|
||||
#### Runtime
|
||||
|
||||
|
@ -689,26 +560,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
|||
```
|
||||
Otherwise, please double-check the GPU driver installation steps.
|
||||
|
||||
- Can I report Ollama issue on Intel GPU to llama.cpp SYCL backend?
|
||||
|
||||
No. We can't support Ollama issue directly, because we aren't familiar with Ollama.
|
||||
|
||||
Sugguest reproducing on llama.cpp and report similar issue to llama.cpp. We will surpport it.
|
||||
|
||||
It's same for other projects including llama.cpp SYCL backend.
|
||||
|
||||
- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
|
||||
|
||||
Device Memory is not enough.
|
||||
|
||||
|Reason|Solution|
|
||||
|-|-|
|
||||
|Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.|
|
||||
|Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;<br>Use more than one devices to load model.|
|
||||
|
||||
### **GitHub contribution**:
|
||||
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
|
||||
|
||||
## TODO
|
||||
|
||||
- NA
|
||||
- Support row layer split for multiple card runs.
|
|
@ -1,4 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <llama.h>
|
||||
|
|
@ -1,5 +0,0 @@
|
|||
module llama [system] {
|
||||
header "llama.h"
|
||||
link "llama"
|
||||
export *
|
||||
}
|
363
ci/run.sh
363
ci/run.sh
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
#/bin/bash
|
||||
#
|
||||
# sample usage:
|
||||
#
|
||||
|
@ -13,9 +13,6 @@
|
|||
# # with SYCL support
|
||||
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
# # with VULKAN support
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
|
||||
if [ -z "$2" ]; then
|
||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||
|
@ -39,11 +36,11 @@ SRC=`pwd`
|
|||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||
|
@ -53,11 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
|
||||
fi
|
||||
## helpers
|
||||
|
||||
|
@ -110,11 +103,8 @@ function gg_run_ctest_debug {
|
|||
|
||||
set -e
|
||||
|
||||
# Check cmake, make and ctest are installed
|
||||
gg_check_build_requirements
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
|
@ -141,11 +131,8 @@ function gg_run_ctest_release {
|
|||
|
||||
set -e
|
||||
|
||||
# Check cmake, make and ctest are installed
|
||||
gg_check_build_requirements
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
@ -273,6 +260,7 @@ function gg_sum_ctest_with_model_release {
|
|||
}
|
||||
|
||||
# open_llama_7b_v2
|
||||
# requires: GG_BUILD_CUDA
|
||||
|
||||
function gg_run_open_llama_7b_v2 {
|
||||
cd ${SRC}
|
||||
|
@ -296,10 +284,10 @@ function gg_run_open_llama_7b_v2 {
|
|||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
|
@ -315,47 +303,47 @@ function gg_run_open_llama_7b_v2 {
|
|||
|
||||
wiki_test="${path_wiki}/wiki.test.raw"
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -431,9 +419,9 @@ function gg_run_pythia_1_4b {
|
|||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
|
@ -449,45 +437,45 @@ function gg_run_pythia_1_4b {
|
|||
|
||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -541,6 +529,7 @@ function gg_sum_pythia_1_4b {
|
|||
}
|
||||
|
||||
# pythia_2_8b
|
||||
# requires: GG_BUILD_CUDA
|
||||
|
||||
function gg_run_pythia_2_8b {
|
||||
cd ${SRC}
|
||||
|
@ -561,10 +550,10 @@ function gg_run_pythia_2_8b {
|
|||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
|
@ -580,47 +569,47 @@ function gg_run_pythia_2_8b {
|
|||
|
||||
wiki_test="${path_wiki}/wiki.test.raw"
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -697,17 +686,17 @@ function gg_run_embd_bge_small {
|
|||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
@ -721,92 +710,8 @@ function gg_sum_embd_bge_small {
|
|||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
}
|
||||
|
||||
# rerank_tiny
|
||||
|
||||
function gg_run_rerank_tiny {
|
||||
cd ${SRC}
|
||||
|
||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
|
||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
|
||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
|
||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
|
||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
|
||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
|
||||
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
|
||||
|
||||
gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json
|
||||
|
||||
path_models="../models-mnt/rerank-tiny"
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
|
||||
# for this model, the SEP token is "</s>"
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
||||
|
||||
# sample output
|
||||
# rerank score 0: 0.029
|
||||
# rerank score 1: 0.029
|
||||
# rerank score 2: 0.135
|
||||
|
||||
# check that the score is in the range [$3, $4]
|
||||
function check_score {
|
||||
qnt="$1"
|
||||
score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
|
||||
if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
|
||||
printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
|
||||
return 20
|
||||
fi
|
||||
|
||||
printf ' - %s @ %s OK\n' "$qnt" "$score"
|
||||
return 0
|
||||
}
|
||||
|
||||
check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
|
||||
check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
|
||||
check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_rerank_tiny {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Rerank Tiny (Jina):\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
|
||||
}
|
||||
|
||||
function gg_check_build_requirements {
|
||||
if ! command -v cmake &> /dev/null; then
|
||||
gg_printf 'cmake not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v make &> /dev/null; then
|
||||
gg_printf 'make not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v ctest &> /dev/null; then
|
||||
gg_printf 'ctest not found, please install'
|
||||
fi
|
||||
}
|
||||
|
||||
## main
|
||||
|
||||
export LLAMA_LOG_PREFIX=1
|
||||
export LLAMA_LOG_TIMESTAMPS=1
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
|
||||
rm -rf ${SRC}/models-mnt
|
||||
|
@ -815,10 +720,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
|||
ln -sfn ${mnt_models} ${SRC}/models-mnt
|
||||
|
||||
# Create a fresh python3 venv and enter it
|
||||
if ! python3 -m venv "$MNT/venv"; then
|
||||
echo "Error: Failed to create Python virtual environment at $MNT/venv."
|
||||
exit 1
|
||||
fi
|
||||
python3 -m venv "$MNT/venv"
|
||||
source "$MNT/venv/bin/activate"
|
||||
|
||||
pip install -r ${SRC}/requirements.txt --disable-pip-version-check
|
||||
|
@ -832,7 +734,6 @@ test $ret -eq 0 && gg_run ctest_release
|
|||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
test $ret -eq 0 && gg_run embd_bge_small
|
||||
test $ret -eq 0 && gg_run rerank_tiny
|
||||
|
||||
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
|
||||
test $ret -eq 0 && gg_run test_scripts_debug
|
||||
|
@ -840,7 +741,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
|||
fi
|
||||
|
||||
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
||||
if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
|
||||
if [ -z ${GG_BUILD_CUDA} ]; then
|
||||
test $ret -eq 0 && gg_run pythia_1_4b
|
||||
else
|
||||
test $ret -eq 0 && gg_run pythia_2_8b
|
||||
|
|
|
@ -79,22 +79,22 @@ endmacro()
|
|||
# flags are for MSVC only!
|
||||
check_sse("AVX" " ;/arch:AVX")
|
||||
if (NOT ${AVX_FOUND})
|
||||
set(GGML_AVX OFF)
|
||||
set(LLAMA_AVX OFF)
|
||||
else()
|
||||
set(GGML_AVX ON)
|
||||
set(LLAMA_AVX ON)
|
||||
endif()
|
||||
|
||||
check_sse("AVX2" " ;/arch:AVX2")
|
||||
check_sse("FMA" " ;/arch:AVX2")
|
||||
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
|
||||
set(GGML_AVX2 OFF)
|
||||
set(LLAMA_AVX2 OFF)
|
||||
else()
|
||||
set(GGML_AVX2 ON)
|
||||
set(LLAMA_AVX2 ON)
|
||||
endif()
|
||||
|
||||
check_sse("AVX512" " ;/arch:AVX512")
|
||||
if (NOT ${AVX512_FOUND})
|
||||
set(GGML_AVX512 OFF)
|
||||
set(LLAMA_AVX512 OFF)
|
||||
else()
|
||||
set(GGML_AVX512 ON)
|
||||
set(LLAMA_AVX512 ON)
|
||||
endif()
|
|
@ -1,16 +0,0 @@
|
|||
set( CMAKE_SYSTEM_NAME Darwin )
|
||||
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||
|
||||
set( target arm64-apple-darwin-macho )
|
||||
|
||||
set( CMAKE_C_COMPILER clang )
|
||||
set( CMAKE_CXX_COMPILER clang++ )
|
||||
|
||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||
|
||||
set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
|
||||
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
|
||||
|
||||
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
|
@ -1,33 +0,0 @@
|
|||
function(llama_add_compile_flags)
|
||||
if (LLAMA_FATAL_WARNINGS)
|
||||
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
list(APPEND C_FLAGS -Werror)
|
||||
list(APPEND CXX_FLAGS -Werror)
|
||||
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
|
||||
add_compile_options(/WX)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_ALL_WARNINGS)
|
||||
if (NOT MSVC)
|
||||
list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
|
||||
-Werror=implicit-int -Werror=implicit-function-declaration)
|
||||
|
||||
list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
|
||||
|
||||
list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
|
||||
|
||||
list(APPEND C_FLAGS ${WARNING_FLAGS})
|
||||
list(APPEND CXX_FLAGS ${WARNING_FLAGS})
|
||||
|
||||
ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
|
||||
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
|
||||
else()
|
||||
# todo : msvc
|
||||
set(C_FLAGS "" PARENT_SCOPE)
|
||||
set(CXX_FLAGS "" PARENT_SCOPE)
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
|
@ -1,22 +0,0 @@
|
|||
find_package(Git)
|
||||
|
||||
# the commit's SHA1
|
||||
execute_process(COMMAND
|
||||
"${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
|
||||
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||
OUTPUT_VARIABLE GIT_SHA1
|
||||
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
# the date of the commit
|
||||
execute_process(COMMAND
|
||||
"${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
|
||||
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||
OUTPUT_VARIABLE GIT_DATE
|
||||
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
# the subject of the commit
|
||||
execute_process(COMMAND
|
||||
"${GIT_EXECUTABLE}" log -1 --format=%s
|
||||
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||
OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
|
||||
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
|
@ -1,30 +0,0 @@
|
|||
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
|
||||
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
|
||||
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
|
||||
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
||||
|
||||
@PACKAGE_INIT@
|
||||
|
||||
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
|
||||
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
|
||||
|
||||
find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)
|
||||
|
||||
find_library(llama_LIBRARY llama
|
||||
REQUIRED
|
||||
HINTS ${LLAMA_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH
|
||||
)
|
||||
|
||||
add_library(llama UNKNOWN IMPORTED)
|
||||
set_target_properties(llama
|
||||
PROPERTIES
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
|
||||
INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||
IMPORTED_LOCATION "${llama_LIBRARY}"
|
||||
INTERFACE_COMPILE_FEATURES c_std_90
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
check_required_components(Llama)
|
|
@ -1,10 +1,10 @@
|
|||
prefix=@CMAKE_INSTALL_PREFIX@
|
||||
exec_prefix=@CMAKE_INSTALL_PREFIX@
|
||||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||
exec_prefix=${prefix}
|
||||
libdir=${exec_prefix}/lib
|
||||
includedir=${prefix}/include
|
||||
|
||||
Name: llama
|
||||
Description: Port of Facebook's LLaMA model in C/C++
|
||||
Version: @LLAMA_INSTALL_VERSION@
|
||||
Libs: -L${libdir} -lggml -lggml-base -lllama
|
||||
Version: @PROJECT_VERSION@
|
||||
Libs: -L${libdir} -lllama
|
||||
Cflags: -I${includedir}
|
||||
|
|
|
@ -1,11 +0,0 @@
|
|||
set( CMAKE_SYSTEM_NAME Windows )
|
||||
set( CMAKE_SYSTEM_PROCESSOR x86_64 )
|
||||
|
||||
set( CMAKE_C_COMPILER clang )
|
||||
set( CMAKE_CXX_COMPILER clang++ )
|
||||
|
||||
set( arch_c_flags "-march=native" )
|
||||
|
||||
set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
|
||||
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
|
||||
|
14
codecov.yml
Normal file
14
codecov.yml
Normal file
|
@ -0,0 +1,14 @@
|
|||
comment: off
|
||||
|
||||
coverage:
|
||||
status:
|
||||
project:
|
||||
default:
|
||||
target: auto
|
||||
threshold: 0
|
||||
base: auto
|
||||
patch:
|
||||
default:
|
||||
target: auto
|
||||
threshold: 0
|
||||
base: auto
|
|
@ -1,8 +1,5 @@
|
|||
# common
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
llama_add_compile_flags()
|
||||
|
||||
# Build info header
|
||||
#
|
||||
|
@ -39,7 +36,7 @@ add_custom_command(
|
|||
COMMENT "Generating build details from Git"
|
||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
||||
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
|
||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
|
||||
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
||||
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
||||
VERBATIM
|
||||
|
@ -53,28 +50,21 @@ endif()
|
|||
set(TARGET common)
|
||||
|
||||
add_library(${TARGET} STATIC
|
||||
arg.cpp
|
||||
arg.h
|
||||
base64.hpp
|
||||
chat.cpp
|
||||
chat.hpp
|
||||
chat-template.hpp
|
||||
common.cpp
|
||||
common.h
|
||||
console.cpp
|
||||
console.h
|
||||
json-schema-to-grammar.cpp
|
||||
json.hpp
|
||||
llguidance.cpp
|
||||
log.cpp
|
||||
log.h
|
||||
minja.hpp
|
||||
ngram-cache.cpp
|
||||
ngram-cache.h
|
||||
sampling.cpp
|
||||
common.cpp
|
||||
sampling.h
|
||||
speculative.cpp
|
||||
speculative.h
|
||||
sampling.cpp
|
||||
console.h
|
||||
console.cpp
|
||||
grammar-parser.h
|
||||
grammar-parser.cpp
|
||||
json.hpp
|
||||
json-schema-to-grammar.cpp
|
||||
train.h
|
||||
train.cpp
|
||||
ngram-cache.h
|
||||
ngram-cache.cpp
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
|
@ -86,39 +76,12 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
|
|||
# Use curl to download model url
|
||||
if (LLAMA_CURL)
|
||||
find_package(CURL REQUIRED)
|
||||
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
|
||||
add_definitions(-DLLAMA_USE_CURL)
|
||||
include_directories(${CURL_INCLUDE_DIRS})
|
||||
find_library(CURL_LIBRARY curl REQUIRED)
|
||||
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
|
||||
endif ()
|
||||
|
||||
if (LLAMA_LLGUIDANCE)
|
||||
include(ExternalProject)
|
||||
set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
|
||||
set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
|
||||
ExternalProject_Add(llguidance_ext
|
||||
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
|
||||
# v0.6.12:
|
||||
GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
|
||||
PREFIX ${CMAKE_BINARY_DIR}/llguidance
|
||||
SOURCE_DIR ${LLGUIDANCE_SRC}
|
||||
BUILD_IN_SOURCE TRUE
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND cargo build --release
|
||||
INSTALL_COMMAND ""
|
||||
BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/libllguidance.a ${LLGUIDANCE_PATH}/llguidance.h
|
||||
UPDATE_COMMAND ""
|
||||
)
|
||||
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)
|
||||
|
||||
add_library(llguidance STATIC IMPORTED)
|
||||
set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/libllguidance.a)
|
||||
add_dependencies(llguidance llguidance_ext)
|
||||
|
||||
target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
|
||||
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance)
|
||||
endif ()
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
target_compile_features (${TARGET} PUBLIC cxx_std_17)
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
||||
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
||||
|
|
2370
common/arg.cpp
2370
common/arg.cpp
File diff suppressed because it is too large
Load diff
80
common/arg.h
80
common/arg.h
|
@ -1,80 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
|
||||
struct common_arg {
|
||||
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
|
||||
std::set<enum llama_example> excludes = {};
|
||||
std::vector<const char *> args;
|
||||
const char * value_hint = nullptr; // help text or example for arg value
|
||||
const char * value_hint_2 = nullptr; // for second arg value
|
||||
const char * env = nullptr;
|
||||
std::string help;
|
||||
bool is_sparam = false; // is current arg a sampling param?
|
||||
void (*handler_void) (common_params & params) = nullptr;
|
||||
void (*handler_string) (common_params & params, const std::string &) = nullptr;
|
||||
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
|
||||
void (*handler_int) (common_params & params, int) = nullptr;
|
||||
|
||||
common_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const std::string & help,
|
||||
void (*handler)(common_params & params, const std::string &)
|
||||
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
|
||||
|
||||
common_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const std::string & help,
|
||||
void (*handler)(common_params & params, int)
|
||||
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
|
||||
|
||||
common_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const std::string & help,
|
||||
void (*handler)(common_params & params)
|
||||
) : args(args), help(help), handler_void(handler) {}
|
||||
|
||||
// support 2 values for arg
|
||||
common_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const char * value_hint_2,
|
||||
const std::string & help,
|
||||
void (*handler)(common_params & params, const std::string &, const std::string &)
|
||||
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
|
||||
|
||||
common_arg & set_examples(std::initializer_list<enum llama_example> examples);
|
||||
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
|
||||
common_arg & set_env(const char * env);
|
||||
common_arg & set_sparam();
|
||||
bool in_example(enum llama_example ex);
|
||||
bool is_exclude(enum llama_example ex);
|
||||
bool get_value_from_env(std::string & output);
|
||||
bool has_value_from_env();
|
||||
std::string to_string();
|
||||
};
|
||||
|
||||
struct common_params_context {
|
||||
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
|
||||
common_params & params;
|
||||
std::vector<common_arg> options;
|
||||
void(*print_usage)(int, char **) = nullptr;
|
||||
common_params_context(common_params & params) : params(params) {}
|
||||
};
|
||||
|
||||
// parse input arguments from CLI
|
||||
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
|
||||
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
// function to be used by test-arg-parser
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
|
@ -1,529 +0,0 @@
|
|||
/*
|
||||
Copyright 2024 Google LLC
|
||||
|
||||
Use of this source code is governed by an MIT-style
|
||||
license that can be found in the LICENSE file or at
|
||||
https://opensource.org/licenses/MIT.
|
||||
*/
|
||||
// SPDX-License-Identifier: MIT
|
||||
#pragma once
|
||||
|
||||
#include "minja.hpp"
|
||||
#include <json.hpp>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
namespace minja {
|
||||
|
||||
struct chat_template_caps {
|
||||
bool supports_tools = false;
|
||||
bool supports_tool_calls = false;
|
||||
bool supports_tool_responses = false;
|
||||
bool supports_system_role = false;
|
||||
bool supports_parallel_tool_calls = false;
|
||||
bool supports_tool_call_id = false;
|
||||
// meta-llama/Llama-3.1-8B-Instruct expects arguments to be an object.
|
||||
// Most other templates (and OpenAI's API) expect the arguments object to be stringified.
|
||||
bool requires_object_arguments = false;
|
||||
// CohereForAI/c4ai-command-r-plus simple variant
|
||||
bool requires_non_null_content = false;
|
||||
// MiniMaxAI/MiniMax-Text-01 special
|
||||
bool requires_typed_content = false;
|
||||
};
|
||||
|
||||
struct chat_template_inputs {
|
||||
nlohmann::ordered_json messages;
|
||||
nlohmann::ordered_json tools;
|
||||
bool add_generation_prompt = true;
|
||||
nlohmann::ordered_json extra_context;
|
||||
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
||||
};
|
||||
|
||||
struct chat_template_options {
|
||||
bool apply_polyfills = true;
|
||||
bool use_bos_token = true;
|
||||
bool use_eos_token = true;
|
||||
bool define_strftime_now = true;
|
||||
|
||||
bool polyfill_tools = true;
|
||||
bool polyfill_tool_call_examples = true;
|
||||
bool polyfill_tool_calls = true;
|
||||
bool polyfill_tool_responses = true;
|
||||
bool polyfill_system_role = true;
|
||||
bool polyfill_object_arguments = true;
|
||||
bool polyfill_typed_content = true;
|
||||
};
|
||||
|
||||
class chat_template {
|
||||
|
||||
private:
|
||||
chat_template_caps caps_;
|
||||
std::string source_;
|
||||
std::string bos_token_;
|
||||
std::string eos_token_;
|
||||
std::shared_ptr<minja::TemplateNode> template_root_;
|
||||
std::string tool_call_example_;
|
||||
|
||||
std::string try_raw_render(
|
||||
const nlohmann::ordered_json & messages,
|
||||
const nlohmann::ordered_json & tools,
|
||||
bool add_generation_prompt,
|
||||
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const
|
||||
{
|
||||
try {
|
||||
chat_template_inputs inputs;
|
||||
inputs.messages = messages;
|
||||
inputs.tools = tools;
|
||||
inputs.add_generation_prompt = add_generation_prompt;
|
||||
inputs.extra_context = extra_context;
|
||||
// Use fixed date for tests
|
||||
inputs.now = std::chrono::system_clock::from_time_t(0);
|
||||
|
||||
chat_template_options opts;
|
||||
opts.apply_polyfills = false;
|
||||
|
||||
auto prompt = apply(inputs, opts);
|
||||
// fprintf(stderr, "try_raw_render: %s\n", prompt.c_str());
|
||||
return prompt;
|
||||
} catch (const std::exception & e) {
|
||||
// fprintf(stderr, "try_raw_render error: %s\n", e.what());
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
chat_template(const std::string & source, const std::string & bos_token, const std::string & eos_token)
|
||||
: source_(source), bos_token_(bos_token), eos_token_(eos_token)
|
||||
{
|
||||
template_root_ = minja::Parser::parse(source_, {
|
||||
/* .trim_blocks = */ true,
|
||||
/* .lstrip_blocks = */ true,
|
||||
/* .keep_trailing_newline = */ false,
|
||||
});
|
||||
|
||||
auto contains = [](const std::string & haystack, const std::string & needle) {
|
||||
return haystack.find(needle) != std::string::npos;
|
||||
};
|
||||
|
||||
const std::string user_needle = "<User Needle>";
|
||||
const std::string sys_needle = "<System Needle>";
|
||||
const json dummy_str_user_msg = {{"role", "user"}, {"content", user_needle}};
|
||||
const json dummy_typed_user_msg = {{"role", "user"}, {"content", json::array({{{"type", "text"}, {"text", user_needle}}})}};
|
||||
|
||||
caps_.requires_typed_content =
|
||||
!contains(try_raw_render(json::array({dummy_str_user_msg}), {}, false), user_needle)
|
||||
&& contains(try_raw_render(json::array({dummy_typed_user_msg}), {}, false), user_needle);
|
||||
|
||||
const auto dummy_user_msg = caps_.requires_typed_content
|
||||
? dummy_typed_user_msg
|
||||
: dummy_str_user_msg;
|
||||
const json needle_system_msg = {
|
||||
{"role", "system"},
|
||||
{"content", caps_.requires_typed_content ? json::array({{{"type", "text"}, {"text", sys_needle}}}) : json(sys_needle)},
|
||||
};
|
||||
|
||||
caps_.supports_system_role = contains(try_raw_render({needle_system_msg, dummy_user_msg,}, {}, false), sys_needle);
|
||||
|
||||
auto out = try_raw_render(json::array({
|
||||
dummy_user_msg
|
||||
}), json::array({
|
||||
{
|
||||
{"name", "some_tool"},
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", "some_tool"},
|
||||
{"description", "Some tool."},
|
||||
{"parameters", {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"arg", {
|
||||
{"type", "string"},
|
||||
{"description", "Some argument."},
|
||||
}},
|
||||
}},
|
||||
{"required", json::array({ "arg" })},
|
||||
}},
|
||||
}},
|
||||
},
|
||||
}), false);
|
||||
caps_.supports_tools = contains(out, "some_tool");
|
||||
|
||||
auto make_tool_calls_msg = [&](const json & tool_calls) {
|
||||
return json {
|
||||
{"role", "assistant"},
|
||||
{"content", nullptr},
|
||||
{"tool_calls", tool_calls},
|
||||
};
|
||||
};
|
||||
auto make_tool_call = [](const std::string & tool_name, const json & arguments) {
|
||||
return json {
|
||||
{"id", "call_1___"},
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"arguments", arguments},
|
||||
{"name", tool_name},
|
||||
}},
|
||||
};
|
||||
};
|
||||
const json dummy_args_obj {{"argument_needle", "print('Hello, World!')"}};
|
||||
|
||||
// Note: the arguments are rendered in both cases, but may be double-escaped, which we don't want.
|
||||
out = try_raw_render(json::array({
|
||||
dummy_user_msg,
|
||||
make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj.dump())})),
|
||||
}), {}, false);
|
||||
auto tool_call_renders_str_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
|
||||
out = try_raw_render(json::array({
|
||||
dummy_user_msg,
|
||||
make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj)})),
|
||||
}), {}, false);
|
||||
auto tool_call_renders_obj_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
|
||||
|
||||
caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
|
||||
caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
|
||||
auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
|
||||
auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
|
||||
caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
|
||||
|
||||
if (caps_.supports_tool_calls) {
|
||||
auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump());
|
||||
auto tc1 = make_tool_call("test_tool1", dummy_args);
|
||||
auto tc2 = make_tool_call("test_tool2", dummy_args);
|
||||
auto out = try_raw_render(json::array({
|
||||
dummy_user_msg,
|
||||
make_tool_calls_msg(json::array({tc1, tc2})),
|
||||
}), {}, false);
|
||||
caps_.supports_parallel_tool_calls = contains(out, "test_tool1") && contains(out, "test_tool2");
|
||||
|
||||
out = try_raw_render(json::array({
|
||||
dummy_user_msg,
|
||||
make_tool_calls_msg(json::array({tc1})),
|
||||
{
|
||||
{"role", "tool"},
|
||||
{"name", "test_tool1"},
|
||||
{"content", "Some response!"},
|
||||
{"tool_call_id", "call_911_"},
|
||||
}
|
||||
}), {}, false);
|
||||
caps_.supports_tool_responses = contains(out, "Some response!");
|
||||
caps_.supports_tool_call_id = contains(out, "call_911_");
|
||||
}
|
||||
|
||||
try {
|
||||
if (!caps_.supports_tools) {
|
||||
const json user_msg {
|
||||
{"role", "user"},
|
||||
{"content", "Hey"},
|
||||
};
|
||||
const json args {
|
||||
{"arg1", "some_value"},
|
||||
};
|
||||
const json tool_call_msg {
|
||||
{"role", "assistant"},
|
||||
{"content", nullptr},
|
||||
{"tool_calls", json::array({
|
||||
{
|
||||
// TODO: detect if requires numerical id or fixed length == 6 like Nemo
|
||||
{"id", "call_1___"},
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", "tool_name"},
|
||||
{"arguments", (caps_.requires_object_arguments ? args : json(minja::Value(args).dump(-1, /* to_json= */ true)))},
|
||||
}},
|
||||
},
|
||||
})},
|
||||
};
|
||||
std::string prefix, full;
|
||||
{
|
||||
chat_template_inputs inputs;
|
||||
inputs.messages = json::array({user_msg});
|
||||
inputs.add_generation_prompt = true;
|
||||
prefix = apply(inputs);
|
||||
}
|
||||
{
|
||||
chat_template_inputs inputs;
|
||||
inputs.messages = json::array({user_msg, tool_call_msg});
|
||||
inputs.add_generation_prompt = false;
|
||||
full = apply(inputs);
|
||||
}
|
||||
auto eos_pos_last = full.rfind(eos_token_);
|
||||
if (eos_pos_last == prefix.size() - eos_token_.size() ||
|
||||
(full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
|
||||
full = full.substr(0, eos_pos_last);
|
||||
}
|
||||
size_t common_prefix_length = 0;
|
||||
for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
|
||||
if (prefix[i] != full[i]) {
|
||||
break;
|
||||
}
|
||||
if (prefix[i] == '<') {
|
||||
// DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
|
||||
// but it removes thinking tags for past messages.
|
||||
// The prefix and full strings diverge at <think> vs. <|tool▁calls▁begin|>, we avoid consuming the leading <.
|
||||
continue;
|
||||
}
|
||||
common_prefix_length = i + 1;
|
||||
}
|
||||
auto example = full.substr(common_prefix_length);
|
||||
if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
|
||||
fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
|
||||
} else {
|
||||
tool_call_example_ = example;
|
||||
}
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
const std::string & source() const { return source_; }
|
||||
const std::string & bos_token() const { return bos_token_; }
|
||||
const std::string & eos_token() const { return eos_token_; }
|
||||
const chat_template_caps & original_caps() const { return caps_; }
|
||||
|
||||
// Deprecated, please use the form with chat_template_inputs and chat_template_options
|
||||
std::string apply(
|
||||
const nlohmann::ordered_json & messages,
|
||||
const nlohmann::ordered_json & tools,
|
||||
bool add_generation_prompt,
|
||||
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(),
|
||||
bool apply_polyfills = true)
|
||||
{
|
||||
fprintf(stderr, "[%s] Deprecated!\n", __func__);
|
||||
chat_template_inputs inputs;
|
||||
inputs.messages = messages;
|
||||
inputs.tools = tools;
|
||||
inputs.add_generation_prompt = add_generation_prompt;
|
||||
inputs.extra_context = extra_context;
|
||||
inputs.now = std::chrono::system_clock::now();
|
||||
|
||||
chat_template_options opts;
|
||||
opts.apply_polyfills = apply_polyfills;
|
||||
|
||||
return apply(inputs, opts);
|
||||
}
|
||||
|
||||
std::string apply(
|
||||
const chat_template_inputs & inputs,
|
||||
const chat_template_options & opts = chat_template_options()) const
|
||||
{
|
||||
json actual_messages;
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto has_tool_calls = false;
|
||||
auto has_tool_responses = false;
|
||||
auto has_string_content = false;
|
||||
for (const auto & message : inputs.messages) {
|
||||
if (message.contains("tool_calls") && !message["tool_calls"].is_null()) {
|
||||
has_tool_calls = true;
|
||||
}
|
||||
if (message.contains("role") && message["role"] == "tool") {
|
||||
has_tool_responses = true;
|
||||
}
|
||||
if (message.contains("content") && message["content"].is_string()) {
|
||||
has_string_content = true;
|
||||
}
|
||||
}
|
||||
|
||||
auto polyfill_system_role = opts.polyfill_system_role && !caps_.supports_system_role;
|
||||
auto polyfill_tools = opts.polyfill_tools && has_tools && !caps_.supports_tools;
|
||||
auto polyfill_tool_call_example = polyfill_tools && opts.polyfill_tool_call_examples;
|
||||
auto polyfill_tool_calls = opts.polyfill_tool_calls && has_tool_calls && !caps_.supports_tool_calls;
|
||||
auto polyfill_tool_responses = opts.polyfill_tool_responses && has_tool_responses && !caps_.supports_tool_responses;
|
||||
auto polyfill_object_arguments = opts.polyfill_object_arguments && has_tool_calls && caps_.requires_object_arguments;
|
||||
auto polyfill_typed_content = opts.polyfill_typed_content && has_string_content && caps_.requires_typed_content;
|
||||
|
||||
auto needs_polyfills = opts.apply_polyfills && (false
|
||||
|| polyfill_system_role
|
||||
|| polyfill_tools
|
||||
|| polyfill_tool_calls
|
||||
|| polyfill_tool_responses
|
||||
|| polyfill_object_arguments
|
||||
|| polyfill_typed_content
|
||||
);
|
||||
|
||||
if (needs_polyfills) {
|
||||
actual_messages = json::array();
|
||||
|
||||
auto add_message = [&](const json & msg) {
|
||||
if (polyfill_typed_content && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) {
|
||||
actual_messages.push_back({
|
||||
{"role", msg.at("role")},
|
||||
{"content", {{
|
||||
{"type", "text"},
|
||||
{"text", msg.at("content")},
|
||||
}}},
|
||||
});
|
||||
} else {
|
||||
actual_messages.push_back(msg);
|
||||
}
|
||||
};
|
||||
|
||||
std::string pending_system;
|
||||
auto flush_sys = [&]() {
|
||||
if (!pending_system.empty()) {
|
||||
add_message({
|
||||
{"role", "user"},
|
||||
{"content", pending_system},
|
||||
});
|
||||
pending_system.clear();
|
||||
}
|
||||
};
|
||||
|
||||
json adjusted_messages;
|
||||
if (polyfill_tools) {
|
||||
adjusted_messages = add_system(inputs.messages,
|
||||
"You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
|
||||
(!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
|
||||
} else {
|
||||
adjusted_messages = inputs.messages;
|
||||
}
|
||||
|
||||
for (const auto & message_ : adjusted_messages) {
|
||||
auto message = message_;
|
||||
if (!message.contains("role") || !message.contains("content")) {
|
||||
throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
|
||||
}
|
||||
std::string role = message.at("role");
|
||||
|
||||
if (message.contains("tool_calls")) {
|
||||
if (polyfill_object_arguments || polyfill_tool_calls) {
|
||||
for (auto & tool_call : message.at("tool_calls")) {
|
||||
if (tool_call["type"] == "function") {
|
||||
auto & function = tool_call.at("function");
|
||||
auto & arguments = function.at("arguments");
|
||||
if (arguments.is_string()) {
|
||||
try {
|
||||
arguments = json::parse(arguments.get<std::string>());
|
||||
} catch (const std::exception & ecvt) {
|
||||
fprintf(stderr, "Failed to parse arguments: %s\n", ecvt.what());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (polyfill_tool_calls) {
|
||||
auto content = message.at("content");
|
||||
auto tool_calls = json::array();
|
||||
for (const auto & tool_call : message.at("tool_calls")) {
|
||||
if (tool_call.at("type") != "function") {
|
||||
continue;
|
||||
}
|
||||
const auto & function = tool_call.at("function");
|
||||
auto tc = json {
|
||||
{"name", function.at("name")},
|
||||
{"arguments", function.at("arguments")},
|
||||
};
|
||||
if (tool_call.contains("id")) {
|
||||
tc["id"] = tool_call["id"];
|
||||
}
|
||||
tool_calls.push_back(tc);
|
||||
}
|
||||
auto obj = json {
|
||||
{"tool_calls", tool_calls},
|
||||
};
|
||||
if (!content.is_null() && content != "") {
|
||||
obj["content"] = content;
|
||||
}
|
||||
message["content"] = obj.dump(2);
|
||||
message.erase("tool_calls");
|
||||
}
|
||||
}
|
||||
if (polyfill_tool_responses && role == "tool") {
|
||||
message["role"] = "user";
|
||||
auto obj = json {
|
||||
{"tool_response", {
|
||||
{"content", message.at("content")},
|
||||
}},
|
||||
};
|
||||
if (message.contains("name")) {
|
||||
obj["tool_response"]["name"] = message.at("name");
|
||||
}
|
||||
if (message.contains("tool_call_id")) {
|
||||
obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
|
||||
}
|
||||
message["content"] = obj.dump(2);
|
||||
message.erase("name");
|
||||
}
|
||||
|
||||
if (!message["content"].is_null() && polyfill_system_role) {
|
||||
std::string content = message.at("content");
|
||||
if (role == "system") {
|
||||
if (!pending_system.empty()) pending_system += "\n";
|
||||
pending_system += content;
|
||||
continue;
|
||||
} else {
|
||||
if (role == "user") {
|
||||
if (!pending_system.empty()) {
|
||||
message["content"] = pending_system + (content.empty() ? "" : "\n" + content);
|
||||
pending_system.clear();
|
||||
}
|
||||
} else {
|
||||
flush_sys();
|
||||
}
|
||||
}
|
||||
}
|
||||
add_message(message);
|
||||
}
|
||||
flush_sys();
|
||||
} else {
|
||||
actual_messages = inputs.messages;
|
||||
}
|
||||
|
||||
auto context = minja::Context::make(json({
|
||||
{"messages", actual_messages},
|
||||
{"add_generation_prompt", inputs.add_generation_prompt},
|
||||
}));
|
||||
context->set("bos_token", opts.use_bos_token ? bos_token_ : "");
|
||||
context->set("eos_token", opts.use_eos_token ? eos_token_ : "");
|
||||
if (opts.define_strftime_now) {
|
||||
auto now = inputs.now;
|
||||
context->set("strftime_now", Value::callable([now](const std::shared_ptr<minja::Context> &, minja::ArgumentsValue & args) {
|
||||
args.expectArgs("strftime_now", {1, 1}, {0, 0});
|
||||
auto format = args.args[0].get<std::string>();
|
||||
|
||||
auto time = std::chrono::system_clock::to_time_t(now);
|
||||
auto local_time = *std::localtime(&time);
|
||||
std::ostringstream ss;
|
||||
ss << std::put_time(&local_time, format.c_str());
|
||||
return ss.str();
|
||||
}));
|
||||
}
|
||||
if (!inputs.tools.is_null()) {
|
||||
context->set("tools", minja::Value(inputs.tools));
|
||||
}
|
||||
if (!inputs.extra_context.is_null()) {
|
||||
for (auto & kv : inputs.extra_context.items()) {
|
||||
context->set(kv.key(), minja::Value(kv.value()));
|
||||
}
|
||||
}
|
||||
|
||||
auto ret = template_root_->render(context);
|
||||
// fprintf(stderr, "actual_messages: %s\n", actual_messages.dump(2).c_str());
|
||||
// fprintf(stderr, "apply: %s\n\n", ret.c_str());
|
||||
return ret;
|
||||
}
|
||||
|
||||
static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
|
||||
json messages_with_system = messages;
|
||||
|
||||
if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
|
||||
std::string existing_system = messages_with_system.at(0).at("content");
|
||||
messages_with_system[0] = json {
|
||||
{"role", "system"},
|
||||
{"content", existing_system + "\n\n" + system_prompt},
|
||||
};
|
||||
} else {
|
||||
messages_with_system.insert(messages_with_system.begin(), json {
|
||||
{"role", "system"},
|
||||
{"content", system_prompt},
|
||||
});
|
||||
}
|
||||
return messages_with_system;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace minja
|
966
common/chat.cpp
966
common/chat.cpp
|
@ -1,966 +0,0 @@
|
|||
#include "chat.hpp"
|
||||
#include "chat-template.hpp"
|
||||
#include "json-schema-to-grammar.h"
|
||||
#include "log.h"
|
||||
#include "minja.hpp"
|
||||
|
||||
std::string common_chat_format_name(common_chat_format format) {
|
||||
switch (format) {
|
||||
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
|
||||
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
|
||||
case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
|
||||
case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
|
||||
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
|
||||
case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
|
||||
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
|
||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
|
||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
|
||||
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
|
||||
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
|
||||
default:
|
||||
throw std::runtime_error("Unknown chat format");
|
||||
}
|
||||
}
|
||||
|
||||
const common_grammar_options grammar_options {
|
||||
/* .dotall = */ false,
|
||||
/* .compact_spaces = */ false,
|
||||
// /* .compact_spaces = */ true,
|
||||
};
|
||||
|
||||
static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) {
|
||||
// // https://json.nlohmann.me/features/parsing/sax_interface/
|
||||
struct json_error_locator : public nlohmann::json_sax<json> {
|
||||
std::size_t position;
|
||||
bool found_error;
|
||||
|
||||
json_error_locator() : position(0), found_error(false) {}
|
||||
|
||||
bool parse_error(std::size_t position, const std::string &, const json::exception &) override {
|
||||
this->position = position - 1;
|
||||
this->found_error = true;
|
||||
return false;
|
||||
}
|
||||
bool null() override { return true; }
|
||||
bool boolean(bool) override { return true; }
|
||||
bool number_integer(number_integer_t) override { return true; }
|
||||
bool number_unsigned(number_unsigned_t) override { return true; }
|
||||
bool number_float(number_float_t, const string_t &) override { return true; }
|
||||
bool string(string_t &) override { return true; }
|
||||
bool binary(binary_t &) override { return true; }
|
||||
bool start_object(std::size_t) override { return true; }
|
||||
bool key(string_t &) override { return true; }
|
||||
bool end_object() override { return true; }
|
||||
bool start_array(std::size_t) override { return true; }
|
||||
bool end_array() override { return true; }
|
||||
};
|
||||
json_error_locator err_loc;
|
||||
json::sax_parse(it, end, &err_loc);
|
||||
|
||||
std::string::const_iterator temptative_end;
|
||||
if (err_loc.found_error) {
|
||||
temptative_end = it + err_loc.position;
|
||||
} else {
|
||||
temptative_end = end;
|
||||
}
|
||||
std::string json_sub {it, temptative_end};
|
||||
try {
|
||||
out = json::parse(json_sub);
|
||||
it = temptative_end;
|
||||
return true;
|
||||
} catch (const std::exception &) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
|
||||
* Aggregates the prefix, suffix and in-between text into the content.
|
||||
*/
|
||||
static common_chat_msg parse_json_tool_calls(
|
||||
const std::string& input,
|
||||
const std::optional<std::regex> & trigger_opt,
|
||||
const std::regex & function_regex,
|
||||
const std::regex & close_regex) {
|
||||
std::smatch match;
|
||||
|
||||
common_chat_msg result;
|
||||
result.role = "assistant";
|
||||
|
||||
|
||||
auto end = input.end();
|
||||
auto it = input.begin();
|
||||
|
||||
if (trigger_opt) {
|
||||
if (!std::regex_search(it, end, match, *trigger_opt)) {
|
||||
result.content = input;
|
||||
return result;
|
||||
}
|
||||
result.content = match.prefix().str();
|
||||
it = match.suffix().first;
|
||||
}
|
||||
|
||||
while (it != end) {
|
||||
std::sregex_iterator rend;
|
||||
std::sregex_iterator rit(it, end, function_regex);
|
||||
if (rit == rend) {
|
||||
fprintf(stderr, "No more tool calls found\n");
|
||||
result.content += std::string(it, end);
|
||||
break;
|
||||
}
|
||||
auto name = rit->str(1);
|
||||
result.content += std::string(it, rit->prefix().second);
|
||||
it = rit->suffix().first;
|
||||
|
||||
json arguments;
|
||||
if (!parse_json(it, end, arguments)) {
|
||||
throw std::runtime_error("Failed to parse json tool call arguments");
|
||||
}
|
||||
if (!std::regex_search(it, end, match, close_regex)) {
|
||||
throw std::runtime_error("Malformed input, missing closing pattern");
|
||||
}
|
||||
it = match.suffix().first;
|
||||
result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) {
|
||||
auto content_end = input.find(prefix);
|
||||
size_t tc_start = std::string::npos;
|
||||
|
||||
common_chat_msg result;
|
||||
result.role = "assistant";
|
||||
const auto process_tool_calls = [&](const json & tool_calls) {
|
||||
for (const auto & tool_call : tool_calls) {
|
||||
const auto & arguments = tool_call["arguments"];
|
||||
result.tool_calls.push_back({
|
||||
tool_call["name"],
|
||||
arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
|
||||
tool_call.contains("id") ? tool_call["id"] : "",
|
||||
});
|
||||
}
|
||||
};
|
||||
if (content_end == std::string::npos) {
|
||||
result.content = input;
|
||||
} else {
|
||||
tc_start = content_end + prefix.size() - rstrip_prefix;
|
||||
result.content = input.substr(0, content_end);
|
||||
auto tool_calls = json::parse(input.substr(tc_start));
|
||||
process_tool_calls(tool_calls);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
|
||||
for (const auto & tool : tools) {
|
||||
if (!tool.contains("type") || tool["type"] != "function" || !tool.contains("function")) {
|
||||
LOG_INF("Skipping tool without function: %s", tool.dump(2).c_str());
|
||||
continue;
|
||||
}
|
||||
fn(tool);
|
||||
}
|
||||
}
|
||||
|
||||
static std::string apply(
|
||||
const common_chat_template & tmpl,
|
||||
const nlohmann::ordered_json & messages,
|
||||
const nlohmann::ordered_json & tools,
|
||||
bool add_generation_prompt,
|
||||
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
|
||||
{
|
||||
minja::chat_template_inputs tmpl_inputs;
|
||||
tmpl_inputs.messages = messages;
|
||||
tmpl_inputs.tools = tools;
|
||||
tmpl_inputs.add_generation_prompt = add_generation_prompt;
|
||||
tmpl_inputs.extra_context = extra_context;
|
||||
// TODO: add flag to control date/time, if only for testing purposes.
|
||||
// tmpl_inputs.now = std::chrono::system_clock::now();
|
||||
|
||||
minja::chat_template_options tmpl_opts;
|
||||
tmpl_opts.use_bos_token = false;
|
||||
tmpl_opts.use_eos_token = false;
|
||||
|
||||
return tmpl.apply(tmpl_inputs, tmpl_opts);
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_generic(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
auto tool_call_schemas = json::array();
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool["function"];
|
||||
auto tool_schema = json {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"name", {
|
||||
{"type", "string"},
|
||||
{"const", function["name"]},
|
||||
}},
|
||||
{"arguments", function["parameters"]},
|
||||
}},
|
||||
{"required", json::array({"name", "arguments"})},
|
||||
};
|
||||
if (function.contains("description")) {
|
||||
tool_schema["description"] = function["description"];
|
||||
}
|
||||
if (inputs.parallel_tool_calls) {
|
||||
tool_schema["properties"]["id"] = {
|
||||
{"type", "string"},
|
||||
{"minLength", 4},
|
||||
};
|
||||
tool_schema["required"].push_back("id");
|
||||
}
|
||||
tool_call_schemas.emplace_back(tool_schema);
|
||||
});
|
||||
const auto tool_call =
|
||||
inputs.parallel_tool_calls
|
||||
? json {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"tool_calls", {
|
||||
{"type", "array"},
|
||||
{"items", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
|
||||
{"anyOf", tool_call_schemas},
|
||||
}},
|
||||
{"minItems", 1},
|
||||
}},
|
||||
}},
|
||||
{"required", json::array({"tool_calls"})},
|
||||
}
|
||||
: json {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"tool_call", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
|
||||
{"anyOf", tool_call_schemas},
|
||||
}},
|
||||
}},
|
||||
{"required", json::array({"tool_call"})},
|
||||
};
|
||||
const auto schema =
|
||||
inputs.tool_choice != "required"
|
||||
? json {
|
||||
{"anyOf", json::array({
|
||||
tool_call,
|
||||
{
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"response", inputs.json_schema.is_null()
|
||||
? json {{"type", "string"}}
|
||||
: inputs.json_schema
|
||||
},
|
||||
}},
|
||||
{"required", json::array({"response"})},
|
||||
},
|
||||
})}
|
||||
}
|
||||
: tool_call;
|
||||
|
||||
data.grammar_lazy = false;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
builder.add_schema("root", schema);
|
||||
}, grammar_options);
|
||||
|
||||
auto tweaked_messages = common_chat_template::add_system(
|
||||
inputs.messages,
|
||||
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
|
||||
|
||||
data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
data.format = COMMON_CHAT_FORMAT_GENERIC;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_generic(const std::string & input) {
|
||||
json data = json::parse(input);
|
||||
common_chat_msg result;
|
||||
result.role = "assistant";
|
||||
if (data.contains("tool_calls")) {
|
||||
for (const auto & tool_call : data["tool_calls"]) {
|
||||
result.tool_calls.push_back({
|
||||
tool_call["name"],
|
||||
tool_call["arguments"].dump(),
|
||||
tool_call.contains("id") ? tool_call["id"] : "",
|
||||
});
|
||||
}
|
||||
} else if (data.contains("tool_call")) {
|
||||
result.tool_calls.push_back({
|
||||
data["tool_call"]["name"],
|
||||
data["tool_call"]["arguments"].dump(),
|
||||
/* id= */ "",
|
||||
});
|
||||
} else if (data.contains("response")) {
|
||||
const auto & response = data["response"];
|
||||
result.content = response.is_string() ? response.get<std::string>() : response.dump(2);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_mistral_nemo(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
||||
common_chat_params data;
|
||||
data.grammar_lazy = inputs.tool_choice != "required";
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
auto schemas = json::array();
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool["function"];
|
||||
schemas.push_back({
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
// Important note: the model is probably trained to take a JSON stringified arguments value.
|
||||
// It's hard to constrain that for now (while reusing the JSON schema conversion), so we're just expecting a plain object.
|
||||
{"name", {
|
||||
{"type", "string"},
|
||||
{"const", function["name"]},
|
||||
}},
|
||||
{"arguments", function["parameters"]},
|
||||
{"id", {
|
||||
{"type", "string"},
|
||||
// Nemo's template expects a 9-character alphanumeric ID.
|
||||
{"pattern", "^[a-zA-Z0-9]{9}$"},
|
||||
}},
|
||||
}},
|
||||
{"required", json::array({"name", "arguments", "id"})},
|
||||
});
|
||||
});
|
||||
auto schema = json {
|
||||
{"type", "array"},
|
||||
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
|
||||
{"minItems", 1},
|
||||
};
|
||||
if (!inputs.parallel_tool_calls) {
|
||||
schema["maxItems"] = 1;
|
||||
}
|
||||
builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
|
||||
}, grammar_options);
|
||||
data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true});
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_mistral_nemo(const std::string & input) {
|
||||
return parse_prefixed_json_tool_call_array(input, "[TOOL_CALLS]");
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
||||
common_chat_params data;
|
||||
data.grammar_lazy = inputs.tool_choice != "required";
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
auto schemas = json::array();
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool["function"];
|
||||
schemas.push_back({
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"tool_call_id", {
|
||||
{"type", "string"},
|
||||
// Command-R's template expects an integer string.
|
||||
{"pattern", "^[0-9]{1,10}$"},
|
||||
}},
|
||||
{"tool_name", {
|
||||
{"type", "string"},
|
||||
{"const", function["name"]},
|
||||
}},
|
||||
{"parameters", function["parameters"]},
|
||||
}},
|
||||
{"required", json::array({"tool_call_id", "tool_name", "parameters"})},
|
||||
});
|
||||
});
|
||||
auto schema = json {
|
||||
{"type", "array"},
|
||||
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
|
||||
{"minItems", 1},
|
||||
};
|
||||
if (!inputs.parallel_tool_calls) {
|
||||
schema["maxItems"] = 1;
|
||||
}
|
||||
builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
|
||||
}, grammar_options);
|
||||
data.grammar_triggers.push_back({"<|START_ACTION|>", /* .at_start = */ false});
|
||||
data.preserved_tokens = {
|
||||
"<|START_RESPONSE|>",
|
||||
"<|END_RESPONSE|>",
|
||||
"<|START_THINKING|>",
|
||||
"<|END_THINKING|>",
|
||||
"<|END_ACTION|>",
|
||||
};
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_command_r7b(const std::string & input) {
|
||||
static std::regex response_regex("<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
|
||||
static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
|
||||
std::smatch match;
|
||||
|
||||
common_chat_msg result;
|
||||
result.role = "assistant";
|
||||
if (std::regex_match(input, match, response_regex)) {
|
||||
result.content = match[1].str();
|
||||
} else if (std::regex_match(input, match, thought_action_regex)) {
|
||||
result.tool_plan = match[1].str();
|
||||
auto actions_str = match[2].str();
|
||||
auto actions = json::parse(actions_str);
|
||||
for (const auto & action : actions) {
|
||||
result.tool_calls.push_back({
|
||||
/* .name = */ action["tool_name"],
|
||||
/* .arguments = */ action["parameters"].dump(),
|
||||
/* .id = */ action["tool_call_id"],
|
||||
});
|
||||
}
|
||||
} else {
|
||||
LOG_ERR("Failed to parse command_r output");
|
||||
result.content = input;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static void expect_tool_parameters(const std::string & name, const json & parameters, const std::vector<std::string> & expected_properties) {
|
||||
if (!parameters.is_object() || !parameters.contains("type") || parameters["type"] != "object" || !parameters.contains("properties") || !parameters.contains("required")) {
|
||||
throw std::runtime_error("Parameters of tool " + name + " must be an object w/ required properties");
|
||||
}
|
||||
const auto & parameters_properties = parameters.at("properties");
|
||||
const auto & parameters_required = parameters.at("required");
|
||||
for (const auto & prop : expected_properties) {
|
||||
if (!parameters_properties.contains(prop)) {
|
||||
throw std::runtime_error("Parameters of tool " + name + " is missing property: " + prop);
|
||||
}
|
||||
if (std::find(parameters_required.begin(), parameters_required.end(), json(prop)) == parameters_required.end()) {
|
||||
throw std::runtime_error("Parameters of tool " + name + " must have property marked as required: " + prop);
|
||||
}
|
||||
}
|
||||
if (parameters_properties.size() != expected_properties.size()) {
|
||||
throw std::runtime_error("Parameters of tool " + name + " must only have these properties:" + string_join(expected_properties, ", "));
|
||||
}
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const common_chat_template & tmpl, const struct common_chat_inputs & inputs, bool allow_python_tag_builtin_tools) {
|
||||
auto builtin_tools = json::array();
|
||||
common_chat_params data;
|
||||
data.grammar_lazy = inputs.tool_choice != "required";
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
|
||||
auto handle_builtin_tool = [&](const std::string & name, const json & parameters) {
|
||||
if (name == "wolfram_alpha") {
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
|
||||
expect_tool_parameters(name, parameters, {"query"});
|
||||
} else if (name == "web_search" || name == "brave_search") {
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
|
||||
expect_tool_parameters(name, parameters, {"query"});
|
||||
} else if (name == "python" || name == "code_interpreter") {
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
|
||||
expect_tool_parameters(name, parameters, {"code"});
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<std::string> kvs;
|
||||
for (const auto & [key, value] : parameters.at("properties").items()) {
|
||||
kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value));
|
||||
}
|
||||
|
||||
tool_rules.push_back(
|
||||
builder.add_rule(
|
||||
name + "-call",
|
||||
"\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\""));
|
||||
builtin_tools.push_back(name);
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool["function"];
|
||||
std::string name = function["name"];
|
||||
auto parameters = function["parameters"];
|
||||
builder.resolve_refs(parameters);
|
||||
|
||||
// https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
|
||||
if (allow_python_tag_builtin_tools) {
|
||||
handle_builtin_tool(name, parameters);
|
||||
}
|
||||
tool_rules.push_back(
|
||||
builder.add_rule(
|
||||
name + "-call",
|
||||
"\"{\" space "
|
||||
"( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? "
|
||||
"\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
|
||||
builder.add_schema(name + "-args", parameters) +
|
||||
" \"}\""));
|
||||
data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true});
|
||||
});
|
||||
data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
|
||||
if (!builtin_tools.empty()) {
|
||||
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
|
||||
}
|
||||
builder.add_rule("root", string_join(tool_rules, " | "));
|
||||
}, grammar_options);
|
||||
data.additional_stops.push_back("<|eom_id|>");
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
|
||||
{"tools_in_user_message", false},
|
||||
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
|
||||
});
|
||||
data.format = allow_python_tag_builtin_tools && !builtin_tools.empty()
|
||||
? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS
|
||||
: COMMON_CHAT_FORMAT_LLAMA_3_X;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
|
||||
// TODO: tighten & simplify the parser, don't accept leading text context.
|
||||
static std::regex function_regex("\\{[\\s\\n\\r]*(?:\"type\"[\\s\\n\\r]*:[\\s\\n\\r]*\"function\"[\\s\\n\\r]*,[\\s\\n\\r]*|[\\s\\n\\r]*)\"name\"[\\s\\n\\r]*:[\\s\\n\\r]*\"([^\"]+)\"[\\s\\n\\r]*,[\\s\\n\\r]*\"parameters\": ");
|
||||
static std::regex close_regex("\\}");
|
||||
static std::regex builtin_call_regex("<\\|python_tag\\|>([^.(]+)\\.call\\((.*)\\)");
|
||||
|
||||
if (with_builtin_tools) {
|
||||
std::smatch match;
|
||||
if (std::regex_match(input, match, builtin_call_regex)) {
|
||||
auto name = match[1].str();
|
||||
auto raw_args = match[2].str();
|
||||
|
||||
// TODO: if/when builtin tools start accepting more than 1 argument, use parse_json for real parsing.
|
||||
auto it_eq = raw_args.find('=');
|
||||
auto arg_name = raw_args.substr(0, it_eq);
|
||||
auto arg_value_str = raw_args.substr(it_eq + 1);
|
||||
auto arg_value = json::parse(arg_value_str);
|
||||
|
||||
return {
|
||||
/* .role = */ "assistant",
|
||||
/* .content = */ match.prefix().str(),
|
||||
/* .tool_calls = */ {
|
||||
{
|
||||
/* .name = */ match[1],
|
||||
/* .arguments = */ (json {
|
||||
{arg_name, arg_value},
|
||||
}).dump(),
|
||||
/* .id = */ "",
|
||||
},
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
||||
common_chat_params data;
|
||||
data.grammar_lazy = inputs.tool_choice != "required";
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool["function"];
|
||||
std::string name = function["name"];
|
||||
auto parameters = function["parameters"];
|
||||
auto args_rule = builder.add_schema(name + "-args", parameters);
|
||||
tool_rules.push_back(builder.add_rule(name + "-call",
|
||||
"\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n```json\\n\" " + args_rule + " \"```<|tool▁call▁end|>\""));
|
||||
});
|
||||
data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", /* .at_start = */ false});
|
||||
data.preserved_tokens = {
|
||||
"<|tool▁sep|>",
|
||||
"<|tool▁call▁end|>",
|
||||
};
|
||||
builder.add_rule("root", "\"<|tool▁calls▁begin|>\" (" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " space");
|
||||
}, grammar_options);
|
||||
auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
data.prompt = prompt;
|
||||
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input) {
|
||||
static std::regex trigger_regex("<|tool▁calls▁begin|>");
|
||||
static std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n");
|
||||
static std::regex close_regex("```<|tool▁call▁end|>");
|
||||
return parse_json_tool_calls(input, trigger_regex, function_regex, close_regex);
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
||||
fprintf(stderr, "%s\n", __func__);
|
||||
common_chat_params data;
|
||||
data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
|
||||
{"datetime", "Jan 29 2025 13:00:00 GMT"},
|
||||
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
|
||||
});
|
||||
if (!inputs.tools.is_null() && !inputs.tools.empty()) {
|
||||
data.grammar_lazy = inputs.tool_choice != "required";
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
auto schemas = json::array();
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool["function"];
|
||||
schemas.push_back({
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"name", {
|
||||
{"type", "string"},
|
||||
{"const", function["name"]},
|
||||
}},
|
||||
{"arguments", function["parameters"]},
|
||||
}},
|
||||
{"required", json::array({"name", "arguments", "id"})},
|
||||
});
|
||||
});
|
||||
auto schema = json {
|
||||
{"type", "array"},
|
||||
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
|
||||
{"minItems", 1},
|
||||
};
|
||||
if (!inputs.parallel_tool_calls) {
|
||||
schema["maxItems"] = 1;
|
||||
}
|
||||
builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema));
|
||||
}, grammar_options);
|
||||
data.grammar_triggers.push_back({" functools[", /* .at_start = */ false});
|
||||
data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2;
|
||||
} else {
|
||||
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
||||
}
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_firefunction_v2(const std::string & input) {
|
||||
return parse_prefixed_json_tool_call_array(input, " functools[", /* rstrip_prefix= */ 1);
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
||||
// >>>all\nlet's call functions>>>fn1\n{"arg1": 1...}\n>>>fn2\n{"arg1": 1...}...
|
||||
// Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
|
||||
common_chat_params data;
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
|
||||
if (!inputs.tools.is_null() && !inputs.tools.empty()) {
|
||||
data.grammar_lazy = inputs.tool_choice != "required";
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> first_tool_rules;
|
||||
std::vector<std::string> subsequent_tool_rules;
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool["function"];
|
||||
std::string name = function["name"];
|
||||
auto parameters = function["parameters"];
|
||||
auto args_rule = builder.add_schema(name + "-args", parameters);
|
||||
first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule));
|
||||
subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule));
|
||||
data.grammar_triggers.push_back({name, /* .at_start = */ true});
|
||||
data.grammar_triggers.push_back({">>>" + name, /* .at_start = */ false});
|
||||
});
|
||||
auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space";
|
||||
if (inputs.parallel_tool_calls) {
|
||||
auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space";
|
||||
builder.add_rule("root", first_rule + " (" + subsequent_rule + ")*");
|
||||
} else {
|
||||
builder.add_rule("root", first_rule);
|
||||
}
|
||||
|
||||
}, grammar_options);
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
static bool consume(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
|
||||
auto expected_it = expected.begin();
|
||||
auto tmp_it = it;
|
||||
while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
|
||||
++tmp_it;
|
||||
++expected_it;
|
||||
}
|
||||
if (expected_it == expected.end()) {
|
||||
it = tmp_it;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
|
||||
static std::regex function_regex(R"((?:>>>)?(\w+)\n)");
|
||||
static std::regex close_regex(R"($|(?=>>>))");
|
||||
|
||||
std::string content;
|
||||
auto it = input.begin();
|
||||
const auto end = input.end();
|
||||
|
||||
if (consume(it, end, "all\n")) {
|
||||
std::smatch match;
|
||||
if (std::regex_search(it, end, match, function_regex)) {
|
||||
auto fun_it = match.prefix().second;
|
||||
content = std::string(it, fun_it);
|
||||
it = fun_it;
|
||||
} else {
|
||||
common_chat_msg res;
|
||||
res.role = "assistant";
|
||||
res.content = std::string(it, end);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
// TODO: tighten & simplify.
|
||||
try {
|
||||
auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
|
||||
res.content = content + res.content;
|
||||
return res;
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("Failed to parse functionary v3.2 input: %s\n", e.what());
|
||||
common_chat_msg res;
|
||||
res.role = "assistant";
|
||||
res.content = input;
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
||||
// https://github.com/MeetKai/functionary/blob/main/tests/prompt_test_v3-llama3.1.txt
|
||||
common_chat_params data;
|
||||
json tools = inputs.tools.is_null() ? inputs.tools : json::array();
|
||||
std::string python_code_argument_name;
|
||||
auto has_raw_python = false;
|
||||
|
||||
data.grammar_lazy = inputs.tool_choice != "required";
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool["function"];
|
||||
const auto & parameters = function["parameters"];
|
||||
std::string name = function["name"];
|
||||
if (name == "python" || name == "ipython") {
|
||||
if (!parameters.contains("type")) {
|
||||
throw std::runtime_error("Missing type in python tool");
|
||||
}
|
||||
has_raw_python = true;
|
||||
auto type = parameters.at("type");
|
||||
if (type == "object") {
|
||||
auto properties = parameters.at("properties");
|
||||
for (auto it = properties.begin(); it != properties.end(); ++it) {
|
||||
if (it.value().at("type") == "string") {
|
||||
if (!python_code_argument_name.empty()) {
|
||||
throw std::runtime_error("Multiple string arguments found in python tool");
|
||||
}
|
||||
python_code_argument_name = it.key();
|
||||
}
|
||||
}
|
||||
if (python_code_argument_name.empty()) {
|
||||
throw std::runtime_error("No string argument found in python tool");
|
||||
}
|
||||
} else if (type != "string") {
|
||||
throw std::runtime_error("Invalid type in python tool: " + type.dump());
|
||||
}
|
||||
}
|
||||
tool_rules.push_back(builder.add_rule(name + "-call", "\"<function=" + name + ">\" " + builder.add_schema(name + "-args", parameters) + " \"</function>\" space"));
|
||||
});
|
||||
if (has_raw_python) {
|
||||
tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
|
||||
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
|
||||
}
|
||||
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
|
||||
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
|
||||
data.grammar_triggers.push_back({"<function=", /* .at_start = */ false});
|
||||
}, grammar_options);
|
||||
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
// TODO: if (has_raw_python)
|
||||
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
|
||||
// This version of Functionary still supports the llama 3.1 tool call format for the python tool.
|
||||
static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
|
||||
std::smatch match;
|
||||
if (std::regex_search(input, match, python_tag_regex)) {
|
||||
auto code = match[1].str();
|
||||
return {
|
||||
/* .role = */ "assistant",
|
||||
/* .content = */ match.prefix().str(),
|
||||
/* .tool_calls = */ {
|
||||
{
|
||||
/* .name = */ "python",
|
||||
/* .arguments = */ (json {{"code", code}}).dump(),
|
||||
/* .id = */ "",
|
||||
},
|
||||
}
|
||||
};
|
||||
}
|
||||
static std::regex function_regex(R"(<function=(\w+)>)");
|
||||
static std::regex close_regex(R"(</function>)");
|
||||
// TODO: tighten & simplify.
|
||||
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
||||
common_chat_params data;
|
||||
// (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
|
||||
data.grammar_lazy = inputs.tool_choice != "required";
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool["function"];
|
||||
std::string name = function["name"];
|
||||
auto parameters = function["parameters"];
|
||||
builder.resolve_refs(parameters);
|
||||
tool_rules.push_back(builder.add_schema(name + "-call", {
|
||||
{"type", "object"},
|
||||
{"properties", json {
|
||||
{"name", json {{"const", name}}},
|
||||
{"arguments", parameters},
|
||||
}},
|
||||
{"required", json::array({"name", "arguments"})},
|
||||
}));
|
||||
});
|
||||
auto tool_call = "\"<tool_call>\" space " + builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " \"</tool_call>\" space";
|
||||
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
|
||||
data.grammar_triggers.push_back({"<tool_call>", /* .at_start = */ false});
|
||||
data.preserved_tokens = { "</tool_call>" };
|
||||
}, grammar_options);
|
||||
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) {
|
||||
try {
|
||||
std::regex start_pattern(R"([\n\s]*<tool_call>)");
|
||||
std::regex middle_pattern(R"([\n\s]*</tool_call>[\n\s]*<tool_call>)");
|
||||
std::regex end_pattern(R"([\n\s]*</tool_call>[\n\s]*$)");
|
||||
|
||||
auto end = input.end();
|
||||
std::sregex_iterator rend;
|
||||
std::sregex_iterator rit(input.begin(), end, start_pattern);
|
||||
if (rit == rend) {
|
||||
return {
|
||||
/* .role = */ "assistant",
|
||||
/* .content = */ input,
|
||||
/* .tool_calls = */ {},
|
||||
};
|
||||
}
|
||||
|
||||
common_chat_msg result;
|
||||
result.role = "assistant";
|
||||
result.content = rit->prefix();
|
||||
|
||||
auto it = rit->suffix().first;
|
||||
while (it != end) {
|
||||
json call;
|
||||
if (!parse_json(it, end, call)) {
|
||||
throw std::runtime_error("Failed to parse json tool call");
|
||||
}
|
||||
const auto & arguments = call["arguments"];
|
||||
result.tool_calls.push_back({
|
||||
call["name"],
|
||||
arguments.dump(),
|
||||
// arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
|
||||
/* id= */ "",
|
||||
});
|
||||
rit = {it, end, middle_pattern};
|
||||
if (rit != rend) {
|
||||
it = rit->suffix().first;
|
||||
} else {
|
||||
rit = {it, end, end_pattern};
|
||||
if (rit == rend) {
|
||||
throw std::runtime_error("Malformed input, missing </tool_call>");
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
} catch (const std::exception & e) {
|
||||
return {
|
||||
/* .role = */ "assistant",
|
||||
/* .content = */ input,
|
||||
/* .tool_calls = */ {},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
||||
common_chat_params data;
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
||||
data.grammar_lazy = false;
|
||||
if (!inputs.json_schema.is_null()) {
|
||||
if (!inputs.grammar.empty()) {
|
||||
throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
|
||||
}
|
||||
data.grammar = json_schema_to_grammar(inputs.json_schema);
|
||||
} else {
|
||||
data.grammar = inputs.grammar.empty();
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
|
||||
auto has_tools = !inputs.tools.is_null() && inputs.tool_choice != "none";
|
||||
LOG_DBG("[%s] has_tools=%s\n", __func__, has_tools ? "true" : "false");
|
||||
|
||||
if (has_tools && !inputs.grammar.empty()) {
|
||||
throw std::runtime_error("Cannot specify grammar with tools");
|
||||
}
|
||||
|
||||
const auto & src = tmpl.source();
|
||||
if (src.find(">>>all") != std::string::npos) {
|
||||
// Functionary prepends "all\n" to plain content outputs, so we use the parser no matter when
|
||||
return common_chat_params_init_functionary_v3_2(tmpl, inputs);
|
||||
}
|
||||
if (src.find(" functools[") != std::string::npos) {
|
||||
// Firefunction v2 requires datetime and functions in the context, even w/o tools.
|
||||
return common_chat_params_init_firefunction_v2(tmpl, inputs);
|
||||
}
|
||||
|
||||
if (!has_tools) {
|
||||
return common_chat_params_init_without_tools(tmpl, inputs);
|
||||
}
|
||||
|
||||
if (src.find("<tool_call>") != std::string::npos) {
|
||||
return common_chat_params_init_hermes_2_pro(tmpl, inputs);
|
||||
}
|
||||
if (src.find("<|start_header_id|>") != std::string::npos
|
||||
&& src.find("<function=") != std::string::npos) {
|
||||
return common_chat_params_init_functionary_v3_1_llama_3_1(tmpl, inputs);
|
||||
}
|
||||
if (src.find("<|start_header_id|>ipython<|end_header_id|>") != std::string::npos) {
|
||||
auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
|
||||
return common_chat_params_init_llama_3_1_tool_calls(tmpl, inputs, allow_python_tag_builtin_tools);
|
||||
}
|
||||
if (src.find("<|tool▁calls▁begin|>") != std::string::npos) {
|
||||
return common_chat_params_init_deepseek_r1(tmpl, inputs);
|
||||
}
|
||||
if (src.find("[TOOL_CALLS]") != std::string::npos) {
|
||||
return common_chat_params_init_mistral_nemo(tmpl, inputs);
|
||||
}
|
||||
if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos) {
|
||||
return common_chat_params_init_command_r7b(tmpl, inputs);
|
||||
}
|
||||
return common_chat_params_init_generic(tmpl, inputs);
|
||||
}
|
||||
|
||||
static common_chat_msg common_chat_parse_content_only(const std::string & input) {
|
||||
return {
|
||||
/* .role = */ "assistant",
|
||||
/* .content = */ input,
|
||||
/* .tool_calls = */ {},
|
||||
};
|
||||
}
|
||||
|
||||
common_chat_msg common_chat_parse(const std::string & input, common_chat_format format) {
|
||||
switch (format) {
|
||||
case COMMON_CHAT_FORMAT_CONTENT_ONLY:
|
||||
return common_chat_parse_content_only(input);
|
||||
case COMMON_CHAT_FORMAT_GENERIC:
|
||||
return common_chat_parse_generic(input);
|
||||
case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
|
||||
return common_chat_parse_mistral_nemo(input);
|
||||
case COMMON_CHAT_FORMAT_LLAMA_3_X:
|
||||
return common_chat_parse_llama_3_1(input);
|
||||
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS:
|
||||
return common_chat_parse_llama_3_1(input, /* with_builtin_tools= */ true);
|
||||
case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
|
||||
return common_chat_parse_deepseek_r1(input);
|
||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
|
||||
return common_chat_parse_functionary_v3_2(input);
|
||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
|
||||
return common_chat_parse_functionary_v3_1_llama_3_1(input);
|
||||
case COMMON_CHAT_FORMAT_HERMES_2_PRO:
|
||||
return common_chat_parse_hermes_2_pro(input);
|
||||
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
|
||||
return common_chat_parse_firefunction_v2(input);
|
||||
case COMMON_CHAT_FORMAT_COMMAND_R7B:
|
||||
return common_chat_parse_command_r7b(input);
|
||||
default:
|
||||
throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
|
||||
}
|
||||
}
|
|
@ -1,52 +0,0 @@
|
|||
// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
#include <json.hpp>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
struct common_chat_inputs {
|
||||
json messages;
|
||||
json tools;
|
||||
json tool_choice;
|
||||
json json_schema;
|
||||
bool parallel_tool_calls;
|
||||
bool stream;
|
||||
std::string grammar;
|
||||
bool add_generation_prompt = true;
|
||||
};
|
||||
|
||||
enum common_chat_format {
|
||||
COMMON_CHAT_FORMAT_CONTENT_ONLY,
|
||||
COMMON_CHAT_FORMAT_GENERIC,
|
||||
COMMON_CHAT_FORMAT_MISTRAL_NEMO,
|
||||
COMMON_CHAT_FORMAT_LLAMA_3_X,
|
||||
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
|
||||
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
|
||||
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
|
||||
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
|
||||
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
|
||||
COMMON_CHAT_FORMAT_HERMES_2_PRO,
|
||||
COMMON_CHAT_FORMAT_COMMAND_R7B,
|
||||
|
||||
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
|
||||
};
|
||||
|
||||
struct common_chat_params {
|
||||
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
||||
json prompt;
|
||||
std::string grammar;
|
||||
bool grammar_lazy = false;
|
||||
std::vector<common_grammar_trigger> grammar_triggers;
|
||||
std::vector<std::string> preserved_tokens;
|
||||
std::vector<std::string> additional_stops;
|
||||
};
|
||||
|
||||
struct common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & params);
|
||||
std::string common_chat_format_name(common_chat_format format);
|
||||
common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
|
3383
common/common.cpp
3383
common/common.cpp
File diff suppressed because it is too large
Load diff
549
common/common.h
549
common/common.h
|
@ -2,12 +2,20 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "llama-cpp.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <set>
|
||||
#include "sampling.h"
|
||||
|
||||
#define LOG_NO_FILE_LINE_FUNCTION
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <random>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <tuple>
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DIRECTORY_SEPARATOR '\\'
|
||||
|
@ -25,192 +33,47 @@
|
|||
|
||||
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
|
||||
|
||||
struct common_adapter_lora_info {
|
||||
std::string path;
|
||||
float scale;
|
||||
|
||||
struct llama_adapter_lora * ptr;
|
||||
};
|
||||
|
||||
using llama_tokens = std::vector<llama_token>;
|
||||
|
||||
// build info
|
||||
extern int LLAMA_BUILD_NUMBER;
|
||||
extern const char * LLAMA_COMMIT;
|
||||
extern const char * LLAMA_COMPILER;
|
||||
extern const char * LLAMA_BUILD_TARGET;
|
||||
extern char const * LLAMA_COMMIT;
|
||||
extern char const * LLAMA_COMPILER;
|
||||
extern char const * LLAMA_BUILD_TARGET;
|
||||
|
||||
struct common_control_vector_load_info;
|
||||
struct llama_control_vector_load_info;
|
||||
|
||||
//
|
||||
// CPU utils
|
||||
//
|
||||
|
||||
struct cpu_params {
|
||||
int n_threads = -1;
|
||||
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
||||
bool mask_valid = false; // Default: any CPU
|
||||
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
||||
bool strict_cpu = false; // Use strict CPU placement
|
||||
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
||||
};
|
||||
|
||||
int32_t cpu_get_num_physical_cores();
|
||||
int32_t cpu_get_num_math();
|
||||
|
||||
//
|
||||
// Common params
|
||||
// CLI argument parsing
|
||||
//
|
||||
|
||||
enum llama_example {
|
||||
LLAMA_EXAMPLE_COMMON,
|
||||
LLAMA_EXAMPLE_SPECULATIVE,
|
||||
LLAMA_EXAMPLE_MAIN,
|
||||
LLAMA_EXAMPLE_INFILL,
|
||||
LLAMA_EXAMPLE_EMBEDDING,
|
||||
LLAMA_EXAMPLE_PERPLEXITY,
|
||||
LLAMA_EXAMPLE_RETRIEVAL,
|
||||
LLAMA_EXAMPLE_PASSKEY,
|
||||
LLAMA_EXAMPLE_IMATRIX,
|
||||
LLAMA_EXAMPLE_BENCH,
|
||||
LLAMA_EXAMPLE_SERVER,
|
||||
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
|
||||
LLAMA_EXAMPLE_EXPORT_LORA,
|
||||
LLAMA_EXAMPLE_LLAVA,
|
||||
LLAMA_EXAMPLE_LOOKUP,
|
||||
LLAMA_EXAMPLE_PARALLEL,
|
||||
LLAMA_EXAMPLE_TTS,
|
||||
struct gpt_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
||||
|
||||
LLAMA_EXAMPLE_COUNT,
|
||||
};
|
||||
|
||||
enum common_sampler_type {
|
||||
COMMON_SAMPLER_TYPE_NONE = 0,
|
||||
COMMON_SAMPLER_TYPE_DRY = 1,
|
||||
COMMON_SAMPLER_TYPE_TOP_K = 2,
|
||||
COMMON_SAMPLER_TYPE_TOP_P = 3,
|
||||
COMMON_SAMPLER_TYPE_MIN_P = 4,
|
||||
//COMMON_SAMPLER_TYPE_TFS_Z = 5,
|
||||
COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
|
||||
COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
|
||||
COMMON_SAMPLER_TYPE_XTC = 8,
|
||||
COMMON_SAMPLER_TYPE_INFILL = 9,
|
||||
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
||||
};
|
||||
|
||||
// dimensionality reduction methods, used by cvector-generator
|
||||
enum dimre_method {
|
||||
DIMRE_METHOD_PCA,
|
||||
DIMRE_METHOD_MEAN,
|
||||
};
|
||||
|
||||
enum common_conversation_mode {
|
||||
COMMON_CONVERSATION_MODE_DISABLED = 0,
|
||||
COMMON_CONVERSATION_MODE_ENABLED = 1,
|
||||
COMMON_CONVERSATION_MODE_AUTO = 2,
|
||||
};
|
||||
|
||||
struct common_grammar_trigger {
|
||||
std::string word;
|
||||
bool at_start;
|
||||
};
|
||||
|
||||
// sampling parameters
|
||||
struct common_params_sampling {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float xtc_probability = 0.00f; // 0.0 = disabled
|
||||
float xtc_threshold = 0.10f; // > 0.5 disables XTC
|
||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
|
||||
float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
|
||||
int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
|
||||
int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool ignore_eos = false;
|
||||
bool no_perf = false; // disable performance metrics
|
||||
bool timing_per_token = false;
|
||||
|
||||
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
|
||||
|
||||
|
||||
std::vector<enum common_sampler_type> samplers = {
|
||||
COMMON_SAMPLER_TYPE_PENALTIES,
|
||||
COMMON_SAMPLER_TYPE_DRY,
|
||||
COMMON_SAMPLER_TYPE_TOP_K,
|
||||
COMMON_SAMPLER_TYPE_TYPICAL_P,
|
||||
COMMON_SAMPLER_TYPE_TOP_P,
|
||||
COMMON_SAMPLER_TYPE_MIN_P,
|
||||
COMMON_SAMPLER_TYPE_XTC,
|
||||
COMMON_SAMPLER_TYPE_TEMPERATURE,
|
||||
};
|
||||
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
bool grammar_lazy = false;
|
||||
std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
|
||||
std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
|
||||
std::set<llama_token> preserved_tokens;
|
||||
|
||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||
|
||||
// print the parameters into a string
|
||||
std::string print() const;
|
||||
};
|
||||
|
||||
struct common_params_speculative {
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
int32_t n_ctx = 0; // draft context size
|
||||
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
||||
int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.9f; // minimum speculative decoding probability (greedy)
|
||||
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
|
||||
std::string model = ""; // draft model for speculative decoding // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
};
|
||||
|
||||
struct common_params_vocoder {
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
|
||||
std::string model = ""; // model path // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
|
||||
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
|
||||
};
|
||||
|
||||
struct common_params {
|
||||
int32_t n_threads = cpu_get_num_math();
|
||||
int32_t n_threads_draft = -1;
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_threads_batch_draft = -1;
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 4096; // context size
|
||||
int32_t n_ctx = 0; // context size
|
||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||
int32_t n_sequences = 1; // number of sequences to decode
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||
int32_t grp_attn_n = 1; // group-attention factor
|
||||
int32_t grp_attn_w = 512; // group-attention width
|
||||
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
|
||||
|
@ -221,56 +84,46 @@ struct common_params {
|
|||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
float defrag_thold = 0.1f; // KV cache defragmentation threshold
|
||||
|
||||
// offload params
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
|
||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||
void * cb_eval_user_data = nullptr;
|
||||
|
||||
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
|
||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
||||
|
||||
struct common_params_sampling sampling;
|
||||
struct common_params_speculative speculative;
|
||||
struct common_params_vocoder vocoder;
|
||||
// // sampling parameters
|
||||
struct llama_sampling_params sparams;
|
||||
|
||||
std::string model = ""; // model path // NOLINT
|
||||
std::string model_alias = ""; // model alias // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
std::string hf_token = ""; // HF token // NOLINT
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string prompt = ""; // NOLINT
|
||||
std::string prompt_file = ""; // store the external prompt file name // NOLINT
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
|
||||
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
|
||||
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
|
||||
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
|
||||
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
||||
std::string logits_file = ""; // file for saving *all* logits // NOLINT
|
||||
std::string model = ""; // model path
|
||||
std::string model_draft = ""; // draft model for speculative decoding
|
||||
std::string model_alias = "unknown"; // model alias
|
||||
std::string model_url = ""; // model url to download
|
||||
std::string hf_repo = ""; // HF repo
|
||||
std::string hf_file = ""; // HF file
|
||||
std::string prompt = "";
|
||||
std::string prompt_file = ""; // store the external prompt file name
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
std::string logdir = ""; // directory in which to save YAML log files
|
||||
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
|
||||
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
|
||||
std::string logits_file = ""; // file for saving *all* logits
|
||||
std::string rpc_servers = ""; // comma separated list of RPC servers
|
||||
|
||||
std::vector<std::string> in_files; // all input files
|
||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
|
||||
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
|
||||
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
|
||||
// TODO: avoid tuple, use struct
|
||||
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
|
||||
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
|
||||
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
|
||||
|
||||
int32_t verbosity = 0;
|
||||
int32_t control_vector_layer_start = -1; // layer range for control vector
|
||||
|
@ -296,66 +149,54 @@ struct common_params {
|
|||
bool special = false; // enable special token output
|
||||
bool interactive = false; // interactive mode
|
||||
bool interactive_first = false; // wait for user input immediately
|
||||
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
|
||||
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
||||
|
||||
bool embedding = false; // get only sentence embedding
|
||||
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
|
||||
bool multiline_input = false; // reverse the usage of `\`
|
||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
||||
bool flash_attn = false; // flash attention
|
||||
bool no_perf = false; // disable performance metrics
|
||||
bool ctx_shift = true; // context shift on inifinite text generation
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool ignore_eos = false; // ignore generated EOS tokens
|
||||
bool logits_all = false; // return logits for all tokens in the batch
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
bool display_prompt = true; // print prompt before generation
|
||||
bool infill = false; // use infill mode
|
||||
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
||||
bool no_kv_offload = false; // disable KV offloading
|
||||
bool warmup = true; // warmup run
|
||||
bool check_tensors = false; // validate tensor data
|
||||
|
||||
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
||||
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
||||
|
||||
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
|
||||
std::string cache_type_k = "f16"; // KV cache data type for the K
|
||||
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||
|
||||
// multimodal models (see examples/llava)
|
||||
std::string mmproj = ""; // path to multimodal projector // NOLINT
|
||||
std::string mmproj = ""; // path to multimodal projector
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
|
||||
// embedding
|
||||
bool embedding = false; // get only sentence embedding
|
||||
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
||||
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
||||
std::string embd_sep = "\n"; // separator of embeddings
|
||||
bool reranking = false; // enable reranking support on server
|
||||
|
||||
// server params
|
||||
int32_t port = 8080; // server listens on this network port
|
||||
int32_t timeout_read = 600; // http read timeout in seconds
|
||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = ""; // NOLINT
|
||||
std::string chat_template = ""; // NOLINT
|
||||
bool use_jinja = false; // NOLINT
|
||||
bool enable_chat_template = true;
|
||||
std::string public_path = "";
|
||||
std::string chat_template = "";
|
||||
std::string system_prompt = "";
|
||||
|
||||
std::vector<std::string> api_keys;
|
||||
|
||||
std::string ssl_file_key = ""; // NOLINT
|
||||
std::string ssl_file_cert = ""; // NOLINT
|
||||
std::string ssl_file_key = "";
|
||||
std::string ssl_file_cert = "";
|
||||
|
||||
// "advanced" endpoints are disabled by default for better security
|
||||
bool webui = true;
|
||||
bool endpoint_slots = false;
|
||||
bool endpoint_props = false; // only control POST requests, not GET
|
||||
bool endpoint_slots = true;
|
||||
bool endpoint_metrics = false;
|
||||
|
||||
bool log_json = false;
|
||||
|
@ -391,63 +232,28 @@ struct common_params {
|
|||
|
||||
bool process_output = false; // collect data for the output tensor
|
||||
bool compute_ppl = true; // whether to compute perplexity
|
||||
|
||||
// cvector-generator params
|
||||
int n_pca_batch = 100;
|
||||
int n_pca_iterations = 1000;
|
||||
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
|
||||
std::string cvector_outfile = "control_vector.gguf";
|
||||
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
||||
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
||||
|
||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||
|
||||
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
||||
|
||||
// batched-bench params
|
||||
bool batched_bench_output_jsonl = false;
|
||||
};
|
||||
|
||||
// call once at the start of a program if it uses libcommon
|
||||
// initializes the logging system and prints info about the build
|
||||
void common_init();
|
||||
void gpt_params_handle_model_default(gpt_params & params);
|
||||
|
||||
std::string common_params_get_system_info(const common_params & params);
|
||||
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
|
||||
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
|
||||
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
|
||||
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
|
||||
|
||||
bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||
bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||
void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
|
||||
bool set_process_priority(enum ggml_sched_priority prio);
|
||||
std::string gpt_params_get_system_info(const gpt_params & params);
|
||||
|
||||
//
|
||||
// String utils
|
||||
//
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
#else
|
||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
#else
|
||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
||||
#endif
|
||||
|
||||
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
||||
std::string string_format(const char * fmt, ...);
|
||||
std::vector<std::string> string_split(std::string input, char separator);
|
||||
|
||||
std::string string_strip(const std::string & str);
|
||||
std::string string_get_sortable_timestamp();
|
||||
|
||||
std::string string_join(const std::vector<std::string> & values, const std::string & separator);
|
||||
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
|
||||
std::string string_repeat(const std::string & str, size_t n);
|
||||
|
||||
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
|
||||
|
||||
template<class T>
|
||||
static std::vector<T> string_split(const std::string & str, char delim) {
|
||||
static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
|
||||
std::vector<T> values;
|
||||
std::istringstream str_stream(str);
|
||||
std::string token;
|
||||
|
@ -460,40 +266,9 @@ static std::vector<T> string_split(const std::string & str, char delim) {
|
|||
return values;
|
||||
}
|
||||
|
||||
template<>
|
||||
std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
|
||||
{
|
||||
std::vector<std::string> parts;
|
||||
size_t begin_pos = 0;
|
||||
size_t separator_pos = input.find(separator);
|
||||
while (separator_pos != std::string::npos) {
|
||||
std::string part = input.substr(begin_pos, separator_pos - begin_pos);
|
||||
parts.emplace_back(part);
|
||||
begin_pos = separator_pos + 1;
|
||||
separator_pos = input.find(separator, begin_pos);
|
||||
}
|
||||
parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
|
||||
return parts;
|
||||
}
|
||||
|
||||
static bool string_starts_with(const std::string & str,
|
||||
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
|
||||
return str.rfind(prefix, 0) == 0;
|
||||
}
|
||||
|
||||
static bool string_ends_with(const std::string & str,
|
||||
const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
|
||||
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
|
||||
}
|
||||
|
||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||
void string_process_escapes(std::string & input);
|
||||
|
||||
std::string string_from(bool value);
|
||||
std::string string_from(const std::vector<int> & values);
|
||||
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
|
||||
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
|
||||
|
||||
//
|
||||
// Filesystem utils
|
||||
//
|
||||
|
@ -508,193 +283,108 @@ std::string fs_get_cache_file(const std::string & filename);
|
|||
// Model utils
|
||||
//
|
||||
|
||||
// note: defines object's lifetime
|
||||
struct common_init_result {
|
||||
llama_model_ptr model;
|
||||
llama_context_ptr context;
|
||||
// TODO: avoid tuplue, use struct
|
||||
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
|
||||
|
||||
std::vector<llama_adapter_lora_ptr> lora;
|
||||
};
|
||||
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
|
||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
|
||||
|
||||
struct common_init_result common_init_from_params(common_params & params);
|
||||
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
|
||||
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
|
||||
|
||||
struct llama_model_params common_model_params_to_llama ( common_params & params);
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
const std::string & model_url,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params);
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
const std::string & repo,
|
||||
const std::string & remote_path,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params);
|
||||
|
||||
std::pair<std::string, std::string> common_get_hf_file(
|
||||
const std::string & hf_repo_with_tag,
|
||||
const std::string & hf_token);
|
||||
|
||||
// clear LoRA adapters from context, then apply new list of adapters
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
|
||||
|
||||
//
|
||||
// Batch utils
|
||||
//
|
||||
|
||||
void common_batch_clear(struct llama_batch & batch);
|
||||
void llama_batch_clear(struct llama_batch & batch);
|
||||
|
||||
void common_batch_add(
|
||||
void llama_batch_add(
|
||||
struct llama_batch & batch,
|
||||
llama_token id,
|
||||
llama_pos pos,
|
||||
const std::vector<llama_seq_id> & seq_ids,
|
||||
bool logits);
|
||||
|
||||
//
|
||||
// Token utils
|
||||
//
|
||||
|
||||
// longest common prefix
|
||||
size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
|
||||
|
||||
// longet common subsequence
|
||||
size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
|
||||
|
||||
//
|
||||
// Vocab utils
|
||||
//
|
||||
|
||||
// tokenizes a string into a vector of tokens
|
||||
// should work similar to Python's `tokenizer.encode`
|
||||
std::vector<llama_token> common_tokenize(
|
||||
std::vector<llama_token> llama_tokenize(
|
||||
const struct llama_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special = false);
|
||||
|
||||
std::vector<llama_token> common_tokenize(
|
||||
const struct llama_vocab * vocab,
|
||||
std::vector<llama_token> llama_tokenize(
|
||||
const struct llama_model * model,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special = false);
|
||||
|
||||
// tokenizes a token into a piece, optionally renders special/control tokens
|
||||
// should work similar to Python's `tokenizer.id_to_piece`
|
||||
std::string common_token_to_piece(
|
||||
std::string llama_token_to_piece(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token,
|
||||
bool special = true);
|
||||
|
||||
std::string common_token_to_piece(
|
||||
const struct llama_vocab * vocab,
|
||||
llama_token token,
|
||||
bool special = true);
|
||||
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
|
||||
// that takes into account the tokenizer type and decides how to handle the leading space
|
||||
//
|
||||
// detokenizes a vector of tokens into a string
|
||||
// should work similar to Python's `tokenizer.decode`
|
||||
// removes the leading space from the first non-BOS token
|
||||
std::string llama_detokenize_spm(
|
||||
llama_context * ctx,
|
||||
const std::vector<llama_token> & tokens);
|
||||
|
||||
// detokenizes a vector of tokens into a string
|
||||
// should work similar to Python's `tokenizer.decode`
|
||||
// optionally renders special/control tokens
|
||||
std::string common_detokenize(
|
||||
const struct llama_context * ctx,
|
||||
const std::vector<llama_token> & tokens,
|
||||
bool special = true);
|
||||
std::string llama_detokenize_bpe(
|
||||
llama_context * ctx,
|
||||
const std::vector<llama_token> & tokens);
|
||||
|
||||
std::string common_detokenize(
|
||||
const struct llama_vocab * vocab,
|
||||
const std::vector<llama_token> & tokens,
|
||||
bool special = true);
|
||||
// Uses the value from the model metadata if possible, otherwise
|
||||
// defaults to true when model type is SPM, otherwise false.
|
||||
bool llama_should_add_bos_token(const llama_model * model);
|
||||
|
||||
//
|
||||
// Chat template utils
|
||||
//
|
||||
|
||||
struct common_tool_call {
|
||||
std::string name;
|
||||
std::string arguments;
|
||||
std::string id;
|
||||
};
|
||||
|
||||
// same with llama_chat_message, but uses std::string
|
||||
struct common_chat_msg {
|
||||
std::string role;
|
||||
std::string content;
|
||||
std::vector<common_tool_call> tool_calls;
|
||||
std::string tool_plan = "";
|
||||
};
|
||||
|
||||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
|
||||
|
||||
namespace minja {
|
||||
class chat_template;
|
||||
}
|
||||
|
||||
typedef minja::chat_template common_chat_template;
|
||||
|
||||
struct common_chat_templates {
|
||||
bool has_explicit_template; // Model had builtin template or template overridde was specified.
|
||||
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
|
||||
std::unique_ptr<common_chat_template> template_tool_use;
|
||||
};
|
||||
|
||||
// CPP wrapper for llama_chat_apply_template
|
||||
// If the built-in template is not supported, we default to chatml
|
||||
// If the custom "tmpl" is not supported, we throw an error
|
||||
std::string common_chat_apply_template(
|
||||
const common_chat_template & tmpl,
|
||||
const std::vector<common_chat_msg> & chat,
|
||||
bool add_ass,
|
||||
bool use_jinja);
|
||||
|
||||
// Format single message, while taking into account the position of that message in chat history
|
||||
std::string common_chat_format_single(
|
||||
const common_chat_template & tmpl,
|
||||
const std::vector<common_chat_msg> & past_msg,
|
||||
const common_chat_msg & new_msg,
|
||||
bool add_ass,
|
||||
bool use_jinja);
|
||||
|
||||
// Returns an example of formatted chat
|
||||
std::string common_chat_format_example(
|
||||
const common_chat_template & tmpl, bool use_jinja);
|
||||
|
||||
common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);
|
||||
bool llama_chat_verify_template(const std::string & tmpl);
|
||||
|
||||
//
|
||||
// KV cache utils
|
||||
//
|
||||
|
||||
// Dump the KV cache view with the number of sequences per cell.
|
||||
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
|
||||
void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
|
||||
|
||||
// Dump the KV cache view showing individual sequences in each cell (long output).
|
||||
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
||||
void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
||||
|
||||
//
|
||||
// Embedding utils
|
||||
//
|
||||
|
||||
// TODO: repace embd_norm with an enum
|
||||
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
|
||||
void llama_embd_normalize(const float * inp, float * out, int n);
|
||||
|
||||
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
|
||||
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
|
||||
|
||||
//
|
||||
// Control vector utils
|
||||
//
|
||||
|
||||
struct common_control_vector_data {
|
||||
struct llama_control_vector_data {
|
||||
int n_embd;
|
||||
|
||||
// stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
|
||||
std::vector<float> data;
|
||||
};
|
||||
|
||||
struct common_control_vector_load_info {
|
||||
struct llama_control_vector_load_info {
|
||||
float strength;
|
||||
|
||||
std::string fname;
|
||||
|
@ -702,16 +392,25 @@ struct common_control_vector_load_info {
|
|||
|
||||
// Load control vectors, scale each by strength, and add them together.
|
||||
// On error, returns {-1, empty}
|
||||
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
|
||||
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
|
||||
|
||||
//
|
||||
// Split utils
|
||||
//
|
||||
|
||||
namespace {
|
||||
static const char * const LLM_KV_SPLIT_NO = "split.no";
|
||||
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
|
||||
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||
|
||||
const char * const LLM_KV_SPLIT_NO = "split.no";
|
||||
const char * const LLM_KV_SPLIT_COUNT = "split.count";
|
||||
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||
//
|
||||
// YAML utils
|
||||
//
|
||||
|
||||
void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
|
||||
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
|
||||
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
|
||||
|
||||
void yaml_dump_non_result_info(
|
||||
FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
||||
|
||||
}
|
||||
|
|
|
@ -94,9 +94,6 @@ namespace console {
|
|||
simple_io = true;
|
||||
}
|
||||
}
|
||||
if (simple_io) {
|
||||
_setmode(_fileno(stdin), _O_U8TEXT);
|
||||
}
|
||||
#else
|
||||
// POSIX-specific console initialization
|
||||
if (!simple_io) {
|
||||
|
|
536
common/grammar-parser.cpp
Normal file
536
common/grammar-parser.cpp
Normal file
|
@ -0,0 +1,536 @@
|
|||
#include "grammar-parser.h"
|
||||
#include <cstdint>
|
||||
#include <cwchar>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <stdexcept>
|
||||
#include <exception>
|
||||
|
||||
namespace grammar_parser {
|
||||
// NOTE: assumes valid utf8 (but checks for overrun)
|
||||
// copied from llama.cpp
|
||||
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
|
||||
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||
uint8_t first_byte = static_cast<uint8_t>(*src);
|
||||
uint8_t highbits = first_byte >> 4;
|
||||
int len = lookup[highbits];
|
||||
uint8_t mask = (1 << (8 - len)) - 1;
|
||||
uint32_t value = first_byte & mask;
|
||||
const char * end = src + len; // may overrun!
|
||||
const char * pos = src + 1;
|
||||
for ( ; pos < end && *pos; pos++) {
|
||||
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
|
||||
}
|
||||
return std::make_pair(value, pos);
|
||||
}
|
||||
|
||||
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||
auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
|
||||
return result.first->second;
|
||||
}
|
||||
|
||||
static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
|
||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
|
||||
return next_id;
|
||||
}
|
||||
|
||||
static void add_rule(
|
||||
parse_state & state,
|
||||
uint32_t rule_id,
|
||||
const std::vector<llama_grammar_element> & rule) {
|
||||
if (state.rules.size() <= rule_id) {
|
||||
state.rules.resize(rule_id + 1);
|
||||
}
|
||||
state.rules[rule_id] = rule;
|
||||
}
|
||||
|
||||
static bool is_digit_char(char c) {
|
||||
return '0' <= c && c <= '9';
|
||||
}
|
||||
|
||||
static bool is_word_char(char c) {
|
||||
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
|
||||
}
|
||||
|
||||
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
|
||||
const char * pos = src;
|
||||
const char * end = src + size;
|
||||
uint32_t value = 0;
|
||||
for ( ; pos < end && *pos; pos++) {
|
||||
value <<= 4;
|
||||
char c = *pos;
|
||||
if ('a' <= c && c <= 'f') {
|
||||
value += c - 'a' + 10;
|
||||
} else if ('A' <= c && c <= 'F') {
|
||||
value += c - 'A' + 10;
|
||||
} else if ('0' <= c && c <= '9') {
|
||||
value += c - '0';
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (pos != end) {
|
||||
throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
|
||||
}
|
||||
return std::make_pair(value, pos);
|
||||
}
|
||||
|
||||
static const char * parse_space(const char * src, bool newline_ok) {
|
||||
const char * pos = src;
|
||||
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
|
||||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
|
||||
if (*pos == '#') {
|
||||
while (*pos && *pos != '\r' && *pos != '\n') {
|
||||
pos++;
|
||||
}
|
||||
} else {
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static const char * parse_name(const char * src) {
|
||||
const char * pos = src;
|
||||
while (is_word_char(*pos)) {
|
||||
pos++;
|
||||
}
|
||||
if (pos == src) {
|
||||
throw std::runtime_error(std::string("expecting name at ") + src);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static const char * parse_int(const char * src) {
|
||||
const char * pos = src;
|
||||
while (is_digit_char(*pos)) {
|
||||
pos++;
|
||||
}
|
||||
if (pos == src) {
|
||||
throw std::runtime_error(std::string("expecting integer at ") + src);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static std::pair<uint32_t, const char *> parse_char(const char * src) {
|
||||
if (*src == '\\') {
|
||||
switch (src[1]) {
|
||||
case 'x': return parse_hex(src + 2, 2);
|
||||
case 'u': return parse_hex(src + 2, 4);
|
||||
case 'U': return parse_hex(src + 2, 8);
|
||||
case 't': return std::make_pair('\t', src + 2);
|
||||
case 'r': return std::make_pair('\r', src + 2);
|
||||
case 'n': return std::make_pair('\n', src + 2);
|
||||
case '\\':
|
||||
case '"':
|
||||
case '[':
|
||||
case ']':
|
||||
return std::make_pair(src[1], src + 2);
|
||||
default:
|
||||
throw std::runtime_error(std::string("unknown escape at ") + src);
|
||||
}
|
||||
} else if (*src) {
|
||||
return decode_utf8(src);
|
||||
}
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
|
||||
const char * parse_alternates(
|
||||
parse_state & state,
|
||||
const char * src,
|
||||
const std::string & rule_name,
|
||||
uint32_t rule_id,
|
||||
bool is_nested);
|
||||
|
||||
static const char * parse_sequence(
|
||||
parse_state & state,
|
||||
const char * src,
|
||||
const std::string & rule_name,
|
||||
std::vector<llama_grammar_element> & out_elements,
|
||||
bool is_nested) {
|
||||
size_t last_sym_start = out_elements.size();
|
||||
const char * pos = src;
|
||||
|
||||
auto handle_repetitions = [&](int min_times, int max_times) {
|
||||
|
||||
if (last_sym_start == out_elements.size()) {
|
||||
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
|
||||
}
|
||||
|
||||
// apply transformation to previous symbol (last_sym_start to end) according to
|
||||
// the following rewrite rules:
|
||||
// S{m,n} --> S S S (m times) S'(n-m)
|
||||
// S'(x) ::= S S'(x-1) |
|
||||
// (... n-m definitions of these S' rules ...)
|
||||
// S'(1) ::= S |
|
||||
// S{m,} --> S S S (m times) S'
|
||||
// S' ::= S S' |
|
||||
// S* --> S{0,}
|
||||
// --> S' ::= S S' |
|
||||
// S+ --> S{1,}
|
||||
// --> S S'
|
||||
// S' ::= S S' |
|
||||
// S? --> S{0,1}
|
||||
// --> S'
|
||||
// S' ::= S |
|
||||
|
||||
std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
|
||||
if (min_times == 0) {
|
||||
out_elements.resize(last_sym_start);
|
||||
} else {
|
||||
// Repeat the previous elements (min_times - 1) times
|
||||
for (int i = 1; i < min_times; i++) {
|
||||
out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t last_rec_rule_id = 0;
|
||||
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
|
||||
|
||||
std::vector<llama_grammar_element> rec_rule(previous_elements);
|
||||
for (int i = 0; i < n_opt; i++) {
|
||||
rec_rule.resize(previous_elements.size());
|
||||
uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
|
||||
if (i > 0 || max_times < 0) {
|
||||
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
|
||||
}
|
||||
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
|
||||
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
|
||||
add_rule(state, rec_rule_id, rec_rule);
|
||||
last_rec_rule_id = rec_rule_id;
|
||||
}
|
||||
if (n_opt > 0) {
|
||||
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
|
||||
}
|
||||
};
|
||||
|
||||
while (*pos) {
|
||||
if (*pos == '"') { // literal string
|
||||
pos++;
|
||||
last_sym_start = out_elements.size();
|
||||
while (*pos != '"') {
|
||||
if (!*pos) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto char_pair = parse_char(pos);
|
||||
pos = char_pair.second;
|
||||
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '[') { // char range(s)
|
||||
pos++;
|
||||
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
|
||||
if (*pos == '^') {
|
||||
pos++;
|
||||
start_type = LLAMA_GRETYPE_CHAR_NOT;
|
||||
}
|
||||
last_sym_start = out_elements.size();
|
||||
while (*pos != ']') {
|
||||
if (!*pos) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto char_pair = parse_char(pos);
|
||||
pos = char_pair.second;
|
||||
enum llama_gretype type = last_sym_start < out_elements.size()
|
||||
? LLAMA_GRETYPE_CHAR_ALT
|
||||
: start_type;
|
||||
|
||||
out_elements.push_back({type, char_pair.first});
|
||||
if (pos[0] == '-' && pos[1] != ']') {
|
||||
if (!pos[1]) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto endchar_pair = parse_char(pos + 1);
|
||||
pos = endchar_pair.second;
|
||||
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
|
||||
}
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (is_word_char(*pos)) { // rule reference
|
||||
const char * name_end = parse_name(pos);
|
||||
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
|
||||
pos = parse_space(name_end, is_nested);
|
||||
last_sym_start = out_elements.size();
|
||||
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
|
||||
} else if (*pos == '(') { // grouping
|
||||
// parse nested alternates into synthesized rule
|
||||
pos = parse_space(pos + 1, true);
|
||||
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
|
||||
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
|
||||
last_sym_start = out_elements.size();
|
||||
// output reference to synthesized rule
|
||||
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
|
||||
if (*pos != ')') {
|
||||
throw std::runtime_error(std::string("expecting ')' at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '.') { // any char
|
||||
last_sym_start = out_elements.size();
|
||||
out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '*') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(0, -1);
|
||||
} else if (*pos == '+') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(1, -1);
|
||||
} else if (*pos == '?') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(0, 1);
|
||||
} else if (*pos == '{') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
|
||||
if (!is_digit_char(*pos)) {
|
||||
throw std::runtime_error(std::string("expecting an int at ") + pos);
|
||||
}
|
||||
const char * int_end = parse_int(pos);
|
||||
int min_times = std::stoul(std::string(pos, int_end - pos));
|
||||
pos = parse_space(int_end, is_nested);
|
||||
|
||||
int max_times = -1;
|
||||
|
||||
if (*pos == '}') {
|
||||
max_times = min_times;
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == ',') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
|
||||
if (is_digit_char(*pos)) {
|
||||
const char * int_end = parse_int(pos);
|
||||
max_times = std::stoul(std::string(pos, int_end - pos));
|
||||
pos = parse_space(int_end, is_nested);
|
||||
}
|
||||
|
||||
if (*pos != '}') {
|
||||
throw std::runtime_error(std::string("expecting '}' at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else {
|
||||
throw std::runtime_error(std::string("expecting ',' at ") + pos);
|
||||
}
|
||||
handle_repetitions(min_times, max_times);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
const char * parse_alternates(
|
||||
parse_state & state,
|
||||
const char * src,
|
||||
const std::string & rule_name,
|
||||
uint32_t rule_id,
|
||||
bool is_nested) {
|
||||
std::vector<llama_grammar_element> rule;
|
||||
const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
|
||||
while (*pos == '|') {
|
||||
rule.push_back({LLAMA_GRETYPE_ALT, 0});
|
||||
pos = parse_space(pos + 1, true);
|
||||
pos = parse_sequence(state, pos, rule_name, rule, is_nested);
|
||||
}
|
||||
rule.push_back({LLAMA_GRETYPE_END, 0});
|
||||
add_rule(state, rule_id, rule);
|
||||
return pos;
|
||||
}
|
||||
|
||||
static const char * parse_rule(parse_state & state, const char * src) {
|
||||
const char * name_end = parse_name(src);
|
||||
const char * pos = parse_space(name_end, false);
|
||||
size_t name_len = name_end - src;
|
||||
uint32_t rule_id = get_symbol_id(state, src, name_len);
|
||||
const std::string name(src, name_len);
|
||||
|
||||
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
|
||||
throw std::runtime_error(std::string("expecting ::= at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 3, true);
|
||||
|
||||
pos = parse_alternates(state, pos, name, rule_id, false);
|
||||
|
||||
if (*pos == '\r') {
|
||||
pos += pos[1] == '\n' ? 2 : 1;
|
||||
} else if (*pos == '\n') {
|
||||
pos++;
|
||||
} else if (*pos) {
|
||||
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
|
||||
}
|
||||
return parse_space(pos, true);
|
||||
}
|
||||
|
||||
parse_state parse(const char * src) {
|
||||
try {
|
||||
parse_state state;
|
||||
const char * pos = parse_space(src, true);
|
||||
while (*pos) {
|
||||
pos = parse_rule(state, pos);
|
||||
}
|
||||
// Validate the state to ensure that all rules are defined
|
||||
for (const auto & rule : state.rules) {
|
||||
for (const auto & elem : rule) {
|
||||
if (elem.type == LLAMA_GRETYPE_RULE_REF) {
|
||||
// Ensure that the rule at that location exists
|
||||
if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
|
||||
// Get the name of the rule that is missing
|
||||
for (const auto & kv : state.symbol_ids) {
|
||||
if (kv.second == elem.value) {
|
||||
throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return state;
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
|
||||
return parse_state();
|
||||
}
|
||||
}
|
||||
|
||||
static void print_grammar_char(FILE * file, uint32_t c) {
|
||||
if (0x20 <= c && c <= 0x7f) {
|
||||
fprintf(file, "%c", static_cast<char>(c));
|
||||
} else {
|
||||
// cop out of encoding UTF-8
|
||||
fprintf(file, "<U+%04X>", c);
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_char_element(llama_grammar_element elem) {
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_CHAR: return true;
|
||||
case LLAMA_GRETYPE_CHAR_NOT: return true;
|
||||
case LLAMA_GRETYPE_CHAR_ALT: return true;
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
|
||||
case LLAMA_GRETYPE_CHAR_ANY: return true;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
|
||||
static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
|
||||
for (auto elem : rule) {
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
|
||||
case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
|
||||
case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
|
||||
case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
|
||||
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
|
||||
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
|
||||
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
|
||||
}
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END:
|
||||
case LLAMA_GRETYPE_ALT:
|
||||
case LLAMA_GRETYPE_RULE_REF:
|
||||
fprintf(file, "(%u) ", elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR:
|
||||
case LLAMA_GRETYPE_CHAR_NOT:
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
||||
case LLAMA_GRETYPE_CHAR_ALT:
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
fprintf(file, "(\"");
|
||||
print_grammar_char(file, elem.value);
|
||||
fprintf(file, "\") ");
|
||||
break;
|
||||
}
|
||||
}
|
||||
fprintf(file, "\n");
|
||||
}
|
||||
|
||||
static void print_rule(
|
||||
FILE * file,
|
||||
uint32_t rule_id,
|
||||
const std::vector<llama_grammar_element> & rule,
|
||||
const std::map<uint32_t, std::string> & symbol_id_names) {
|
||||
if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
|
||||
throw std::runtime_error(
|
||||
"malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
|
||||
}
|
||||
fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
|
||||
for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
|
||||
llama_grammar_element elem = rule[i];
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END:
|
||||
throw std::runtime_error(
|
||||
"unexpected end of rule: " + std::to_string(rule_id) + "," +
|
||||
std::to_string(i));
|
||||
case LLAMA_GRETYPE_ALT:
|
||||
fprintf(file, "| ");
|
||||
break;
|
||||
case LLAMA_GRETYPE_RULE_REF:
|
||||
fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR:
|
||||
fprintf(file, "[");
|
||||
print_grammar_char(file, elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR_NOT:
|
||||
fprintf(file, "[^");
|
||||
print_grammar_char(file, elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
||||
if (i == 0 || !is_char_element(rule[i - 1])) {
|
||||
throw std::runtime_error(
|
||||
"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
|
||||
std::to_string(rule_id) + "," + std::to_string(i));
|
||||
}
|
||||
fprintf(file, "-");
|
||||
print_grammar_char(file, elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR_ALT:
|
||||
if (i == 0 || !is_char_element(rule[i - 1])) {
|
||||
throw std::runtime_error(
|
||||
"LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
|
||||
std::to_string(rule_id) + "," + std::to_string(i));
|
||||
}
|
||||
print_grammar_char(file, elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
fprintf(file, ".");
|
||||
break;
|
||||
}
|
||||
if (is_char_element(elem)) {
|
||||
switch (rule[i + 1].type) {
|
||||
case LLAMA_GRETYPE_CHAR_ALT:
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
break;
|
||||
default:
|
||||
fprintf(file, "] ");
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(file, "\n");
|
||||
}
|
||||
|
||||
void print_grammar(FILE * file, const parse_state & state) {
|
||||
try {
|
||||
std::map<uint32_t, std::string> symbol_id_names;
|
||||
for (const auto & kv : state.symbol_ids) {
|
||||
symbol_id_names[kv.second] = kv.first;
|
||||
}
|
||||
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
|
||||
// fprintf(file, "%zu: ", i);
|
||||
// print_rule_binary(file, state.rules[i]);
|
||||
print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
|
||||
// fprintf(file, "\n");
|
||||
}
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> parse_state::c_rules() {
|
||||
std::vector<const llama_grammar_element *> ret;
|
||||
ret.reserve(rules.size());
|
||||
for (const auto & rule : rules) {
|
||||
ret.push_back(rule.data());
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
}
|
29
common/grammar-parser.h
Normal file
29
common/grammar-parser.h
Normal file
|
@ -0,0 +1,29 @@
|
|||
// Implements a parser for an extended Backus-Naur form (BNF), producing the
|
||||
// binary context-free grammar format specified by llama.h. Supports character
|
||||
// ranges, grouping, and repetition operators. As an example, a grammar for
|
||||
// arithmetic might look like:
|
||||
//
|
||||
// root ::= expr
|
||||
// expr ::= term ([-+*/] term)*
|
||||
// term ::= num | "(" space expr ")" space
|
||||
// num ::= [0-9]+ space
|
||||
// space ::= [ \t\n]*
|
||||
|
||||
#pragma once
|
||||
#include "llama.h"
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
namespace grammar_parser {
|
||||
struct parse_state {
|
||||
std::map<std::string, uint32_t> symbol_ids;
|
||||
std::vector<std::vector<llama_grammar_element>> rules;
|
||||
|
||||
std::vector<const llama_grammar_element *> c_rules();
|
||||
};
|
||||
|
||||
parse_state parse(const char * src);
|
||||
void print_grammar(FILE * file, const parse_state & state);
|
||||
}
|
|
@ -1,6 +1,4 @@
|
|||
#include "json-schema-to-grammar.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
|
@ -13,6 +11,11 @@
|
|||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
template <typename Iterator>
|
||||
static std::string join(Iterator begin, Iterator end, const std::string & separator);
|
||||
|
||||
static std::string repeat(const std::string & str, size_t n);
|
||||
|
||||
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
|
||||
auto has_max = max_items != std::numeric_limits<int>::max();
|
||||
|
||||
|
@ -37,233 +40,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
|
|||
return result;
|
||||
}
|
||||
|
||||
/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
|
||||
class string_view {
|
||||
const std::string & _str;
|
||||
const size_t _start;
|
||||
const size_t _end;
|
||||
public:
|
||||
string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
|
||||
|
||||
size_t size() const {
|
||||
return _end - _start;
|
||||
}
|
||||
|
||||
size_t length() const {
|
||||
return size();
|
||||
}
|
||||
|
||||
operator std::string() const {
|
||||
return str();
|
||||
}
|
||||
|
||||
std::string str() const {
|
||||
return _str.substr(_start, _end - _start);
|
||||
}
|
||||
|
||||
string_view substr(size_t pos, size_t len = std::string::npos) const {
|
||||
return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
|
||||
}
|
||||
|
||||
char operator[](size_t pos) const {
|
||||
auto index = _start + pos;
|
||||
if (index >= _end) {
|
||||
throw std::out_of_range("string_view index out of range");
|
||||
}
|
||||
return _str[_start + pos];
|
||||
}
|
||||
|
||||
bool operator==(const string_view & other) const {
|
||||
std::string this_str = *this;
|
||||
std::string other_str = other;
|
||||
return this_str == other_str;
|
||||
}
|
||||
};
|
||||
|
||||
static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
|
||||
auto has_min = min_value != std::numeric_limits<int>::min();
|
||||
auto has_max = max_value != std::numeric_limits<int>::max();
|
||||
|
||||
auto digit_range = [&](char from, char to) {
|
||||
out << "[";
|
||||
if (from == to) {
|
||||
out << from;
|
||||
} else {
|
||||
out << from << "-" << to;
|
||||
}
|
||||
out << "]";
|
||||
};
|
||||
auto more_digits = [&](int min_digits, int max_digits) {
|
||||
out << "[0-9]";
|
||||
if (min_digits == max_digits && min_digits == 1) {
|
||||
return;
|
||||
}
|
||||
out << "{";
|
||||
out << min_digits;
|
||||
if (max_digits != min_digits) {
|
||||
out << ",";
|
||||
if (max_digits != std::numeric_limits<int>::max()) {
|
||||
out << max_digits;
|
||||
}
|
||||
}
|
||||
out << "}";
|
||||
};
|
||||
std::function<void(const string_view &, const string_view &)> uniform_range =
|
||||
[&](const string_view & from, const string_view & to) {
|
||||
size_t i = 0;
|
||||
while (i < from.length() && i < to.length() && from[i] == to[i]) {
|
||||
i++;
|
||||
}
|
||||
if (i > 0) {
|
||||
out << "\"" << from.substr(0, i).str() << "\"";
|
||||
}
|
||||
if (i < from.length() && i < to.length()) {
|
||||
if (i > 0) {
|
||||
out << " ";
|
||||
}
|
||||
auto sub_len = from.length() - i - 1;
|
||||
if (sub_len > 0) {
|
||||
auto from_sub = from.substr(i + 1);
|
||||
auto to_sub = to.substr(i + 1);
|
||||
auto sub_zeros = string_repeat("0", sub_len);
|
||||
auto sub_nines = string_repeat("9", sub_len);
|
||||
|
||||
auto to_reached = false;
|
||||
out << "(";
|
||||
if (from_sub == sub_zeros) {
|
||||
digit_range(from[i], to[i] - 1);
|
||||
out << " ";
|
||||
more_digits(sub_len, sub_len);
|
||||
} else {
|
||||
out << "[" << from[i] << "] ";
|
||||
out << "(";
|
||||
uniform_range(from_sub, sub_nines);
|
||||
out << ")";
|
||||
if (from[i] < to[i] - 1) {
|
||||
out << " | ";
|
||||
if (to_sub == sub_nines) {
|
||||
digit_range(from[i] + 1, to[i]);
|
||||
to_reached = true;
|
||||
} else {
|
||||
digit_range(from[i] + 1, to[i] - 1);
|
||||
}
|
||||
out << " ";
|
||||
more_digits(sub_len, sub_len);
|
||||
}
|
||||
}
|
||||
if (!to_reached) {
|
||||
out << " | ";
|
||||
digit_range(to[i], to[i]);
|
||||
out << " ";
|
||||
uniform_range(sub_zeros, to_sub);
|
||||
}
|
||||
out << ")";
|
||||
} else {
|
||||
out << "[" << from[i] << "-" << to[i] << "]";
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if (has_min && has_max) {
|
||||
if (min_value < 0 && max_value < 0) {
|
||||
out << "\"-\" (";
|
||||
_build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
|
||||
out << ")";
|
||||
return;
|
||||
}
|
||||
|
||||
if (min_value < 0) {
|
||||
out << "\"-\" (";
|
||||
_build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
|
||||
out << ") | ";
|
||||
min_value = 0;
|
||||
}
|
||||
|
||||
auto min_s = std::to_string(min_value);
|
||||
auto max_s = std::to_string(max_value);
|
||||
auto min_digits = min_s.length();
|
||||
auto max_digits = max_s.length();
|
||||
|
||||
for (auto digits = min_digits; digits < max_digits; digits++) {
|
||||
uniform_range(min_s, string_repeat("9", digits));
|
||||
min_s = "1" + string_repeat("0", digits);
|
||||
out << " | ";
|
||||
}
|
||||
uniform_range(min_s, max_s);
|
||||
return;
|
||||
}
|
||||
|
||||
auto less_decimals = std::max(decimals_left - 1, 1);
|
||||
|
||||
if (has_min) {
|
||||
if (min_value < 0) {
|
||||
out << "\"-\" (";
|
||||
_build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
|
||||
out << ") | [0] | [1-9] ";
|
||||
more_digits(0, decimals_left - 1);
|
||||
} else if (min_value == 0) {
|
||||
if (top_level) {
|
||||
out << "[0] | [1-9] ";
|
||||
more_digits(0, less_decimals);
|
||||
} else {
|
||||
more_digits(1, decimals_left);
|
||||
}
|
||||
} else if (min_value <= 9) {
|
||||
char c = '0' + min_value;
|
||||
auto range_start = top_level ? '1' : '0';
|
||||
if (c > range_start) {
|
||||
digit_range(range_start, c - 1);
|
||||
out << " ";
|
||||
more_digits(1, less_decimals);
|
||||
out << " | ";
|
||||
}
|
||||
digit_range(c, '9');
|
||||
out << " ";
|
||||
more_digits(0, less_decimals);
|
||||
} else {
|
||||
auto min_s = std::to_string(min_value);
|
||||
auto len = min_s.length();
|
||||
auto c = min_s[0];
|
||||
|
||||
if (c > '1') {
|
||||
digit_range(top_level ? '1' : '0', c - 1);
|
||||
out << " ";
|
||||
more_digits(len, less_decimals);
|
||||
out << " | ";
|
||||
}
|
||||
digit_range(c, c);
|
||||
out << " (";
|
||||
_build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
|
||||
out << ")";
|
||||
if (c < '9') {
|
||||
out << " | ";
|
||||
digit_range(c + 1, '9');
|
||||
out << " ";
|
||||
more_digits(len - 1, less_decimals);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (has_max) {
|
||||
if (max_value >= 0) {
|
||||
if (top_level) {
|
||||
out << "\"-\" [1-9] ";
|
||||
more_digits(0, less_decimals);
|
||||
out << " | ";
|
||||
}
|
||||
_build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
|
||||
} else {
|
||||
out << "\"-\" (";
|
||||
_build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
|
||||
out << ")";
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
throw std::runtime_error("At least one of min_value or max_value must be set");
|
||||
}
|
||||
|
||||
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
|
||||
|
||||
struct BuiltinRule {
|
||||
|
@ -313,7 +89,50 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
|
|||
};
|
||||
|
||||
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
|
||||
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
|
||||
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
|
||||
|
||||
template <typename Iterator>
|
||||
std::string join(Iterator begin, Iterator end, const std::string & separator) {
|
||||
std::ostringstream result;
|
||||
if (begin != end) {
|
||||
result << *begin;
|
||||
for (Iterator it = begin + 1; it != end; ++it) {
|
||||
result << separator << *it;
|
||||
}
|
||||
}
|
||||
return result.str();
|
||||
}
|
||||
|
||||
static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
|
||||
std::vector<std::string> tokens;
|
||||
size_t start = 0;
|
||||
size_t end = str.find(delimiter);
|
||||
|
||||
while (end != std::string::npos) {
|
||||
tokens.push_back(str.substr(start, end - start));
|
||||
start = end + delimiter.length();
|
||||
end = str.find(delimiter, start);
|
||||
}
|
||||
|
||||
tokens.push_back(str.substr(start));
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
static std::string repeat(const std::string & str, size_t n) {
|
||||
if (n == 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
std::string result;
|
||||
result.reserve(str.length() * n);
|
||||
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
result += str;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
|
||||
std::smatch match;
|
||||
|
@ -341,9 +160,9 @@ static std::string format_literal(const std::string & literal) {
|
|||
return "\"" + escaped + "\"";
|
||||
}
|
||||
|
||||
|
||||
class SchemaConverter {
|
||||
private:
|
||||
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
||||
std::function<json(const std::string &)> _fetch_json;
|
||||
bool _dotall;
|
||||
std::map<std::string, std::string> _rules;
|
||||
|
@ -373,7 +192,7 @@ private:
|
|||
for (size_t i = 0; i < alt_schemas.size(); i++) {
|
||||
rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
|
||||
}
|
||||
return string_join(rules, " | ");
|
||||
return join(rules.begin(), rules.end(), " | ");
|
||||
}
|
||||
|
||||
std::string _visit_pattern(const std::string & pattern, const std::string & name) {
|
||||
|
@ -436,7 +255,7 @@ private:
|
|||
for (const auto & item : ret) {
|
||||
results.push_back(to_rule(item));
|
||||
}
|
||||
return std::make_pair(string_join(results, " "), false);
|
||||
return std::make_pair(join(results.begin(), results.end(), " "), false);
|
||||
};
|
||||
|
||||
while (i < length) {
|
||||
|
@ -494,7 +313,7 @@ private:
|
|||
}
|
||||
curly_brackets += '}';
|
||||
i++;
|
||||
auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
|
||||
auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
|
||||
int min_times = 0;
|
||||
int max_times = std::numeric_limits<int>::max();
|
||||
try {
|
||||
|
@ -566,76 +385,7 @@ private:
|
|||
}
|
||||
return join_seq();
|
||||
};
|
||||
return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
|
||||
}
|
||||
|
||||
/*
|
||||
Returns a rule that matches a JSON string that is none of the provided strings
|
||||
|
||||
not_strings({"a"})
|
||||
-> ["] ( [a] char+ | [^"a] char* )? ["] space
|
||||
not_strings({"and", "also"})
|
||||
-> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
|
||||
*/
|
||||
std::string _not_strings(const std::vector<std::string> & strings) {
|
||||
|
||||
struct TrieNode {
|
||||
std::map<char, TrieNode> children;
|
||||
bool is_end_of_string;
|
||||
|
||||
TrieNode() : is_end_of_string(false) {}
|
||||
|
||||
void insert(const std::string & string) {
|
||||
auto node = this;
|
||||
for (char c : string) {
|
||||
node = &node->children[c];
|
||||
}
|
||||
node->is_end_of_string = true;
|
||||
}
|
||||
};
|
||||
|
||||
TrieNode trie;
|
||||
for (const auto & s : strings) {
|
||||
trie.insert(s);
|
||||
}
|
||||
|
||||
std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
|
||||
std::ostringstream out;
|
||||
out << "[\"] ( ";
|
||||
std::function<void(const TrieNode &)> visit = [&](const TrieNode & node) {
|
||||
std::ostringstream rejects;
|
||||
auto first = true;
|
||||
for (const auto & kv : node.children) {
|
||||
rejects << kv.first;
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
out << " | ";
|
||||
}
|
||||
out << "[" << kv.first << "]";
|
||||
if (!kv.second.children.empty()) {
|
||||
out << " (";
|
||||
visit(kv.second);
|
||||
out << ")";
|
||||
} else if (kv.second.is_end_of_string) {
|
||||
out << " " << char_rule << "+";
|
||||
}
|
||||
}
|
||||
if (!node.children.empty()) {
|
||||
if (!first) {
|
||||
out << " | ";
|
||||
}
|
||||
out << "[^\"" << rejects.str() << "] " << char_rule << "*";
|
||||
}
|
||||
};
|
||||
visit(trie);
|
||||
|
||||
out << " )";
|
||||
if (!trie.is_end_of_string) {
|
||||
out << "?";
|
||||
}
|
||||
out << " [\"] space";
|
||||
return out.str();
|
||||
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
|
||||
}
|
||||
|
||||
std::string _resolve_ref(const std::string & ref) {
|
||||
|
@ -658,7 +408,6 @@ private:
|
|||
std::vector<std::string> required_props;
|
||||
std::vector<std::string> optional_props;
|
||||
std::unordered_map<std::string, std::string> prop_kv_rule_names;
|
||||
std::vector<std::string> prop_names;
|
||||
for (const auto & kv : properties) {
|
||||
const auto &prop_name = kv.first;
|
||||
const auto &prop_schema = kv.second;
|
||||
|
@ -673,18 +422,11 @@ private:
|
|||
} else {
|
||||
optional_props.push_back(prop_name);
|
||||
}
|
||||
prop_names.push_back(prop_name);
|
||||
}
|
||||
if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
|
||||
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
|
||||
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
|
||||
std::string value_rule =
|
||||
additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
|
||||
: _add_primitive("value", PRIMITIVE_RULES.at("value"));
|
||||
|
||||
auto key_rule =
|
||||
prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string"))
|
||||
: _add_rule(sub_name + "-k", _not_strings(prop_names));
|
||||
std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule);
|
||||
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
|
||||
std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
|
||||
prop_kv_rule_names["*"] = kv_rule;
|
||||
optional_props.push_back("*");
|
||||
}
|
||||
|
@ -710,11 +452,15 @@ private:
|
|||
}
|
||||
std::string k = ks[0];
|
||||
std::string kv_rule_name = prop_kv_rule_names[k];
|
||||
std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
|
||||
if (first_is_optional) {
|
||||
res = comma_ref + (k == "*" ? "*" : "?");
|
||||
if (k == "*") {
|
||||
res = _add_rule(
|
||||
name + (name.empty() ? "" : "-") + "additional-kvs",
|
||||
kv_rule_name + " ( \",\" space " + kv_rule_name + " )*"
|
||||
);
|
||||
} else if (first_is_optional) {
|
||||
res = "( \",\" space " + kv_rule_name + " )?";
|
||||
} else {
|
||||
res = kv_rule_name + (k == "*" ? " " + comma_ref + "*" : "");
|
||||
res = kv_rule_name;
|
||||
}
|
||||
if (ks.size() > 1) {
|
||||
res += " " + _add_rule(
|
||||
|
@ -764,11 +510,10 @@ private:
|
|||
public:
|
||||
SchemaConverter(
|
||||
const std::function<json(const std::string &)> & fetch_json,
|
||||
bool dotall,
|
||||
bool compact_spaces)
|
||||
bool dotall)
|
||||
: _fetch_json(fetch_json), _dotall(dotall)
|
||||
{
|
||||
_rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
|
||||
_rules["space"] = SPACE_RULE;
|
||||
}
|
||||
|
||||
void resolve_refs(json & schema, const std::string & url) {
|
||||
|
@ -810,7 +555,7 @@ public:
|
|||
return;
|
||||
}
|
||||
std::string pointer = ref.substr(ref.find('#') + 1);
|
||||
std::vector<std::string> tokens = string_split(pointer, "/");
|
||||
std::vector<std::string> tokens = split(pointer, "/");
|
||||
for (size_t i = 1; i < tokens.size(); ++i) {
|
||||
std::string sel = tokens[i];
|
||||
if (target.is_null() || !target.contains(sel)) {
|
||||
|
@ -849,19 +594,17 @@ public:
|
|||
} else if (schema_type.is_array()) {
|
||||
std::vector<json> schema_types;
|
||||
for (const auto & t : schema_type) {
|
||||
json schema_copy(schema);
|
||||
schema_copy["type"] = t;
|
||||
schema_types.push_back(schema_copy);
|
||||
schema_types.push_back({{"type", t}});
|
||||
}
|
||||
return _add_rule(rule_name, _generate_union_rule(name, schema_types));
|
||||
} else if (schema.contains("const")) {
|
||||
return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
|
||||
return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
|
||||
} else if (schema.contains("enum")) {
|
||||
std::vector<std::string> enum_values;
|
||||
for (const auto & v : schema["enum"]) {
|
||||
enum_values.push_back(_generate_constant_rule(v));
|
||||
}
|
||||
return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
|
||||
return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | "));
|
||||
} else if ((schema_type.is_null() || schema_type == "object")
|
||||
&& (schema.contains("properties") ||
|
||||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
|
||||
|
@ -943,24 +686,6 @@ public:
|
|||
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
|
||||
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
|
||||
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
|
||||
} else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
|
||||
int min_value = std::numeric_limits<int>::min();
|
||||
int max_value = std::numeric_limits<int>::max();
|
||||
if (schema.contains("minimum")) {
|
||||
min_value = schema["minimum"].get<int>();
|
||||
} else if (schema.contains("exclusiveMinimum")) {
|
||||
min_value = schema["exclusiveMinimum"].get<int>() + 1;
|
||||
}
|
||||
if (schema.contains("maximum")) {
|
||||
max_value = schema["maximum"].get<int>();
|
||||
} else if (schema.contains("exclusiveMaximum")) {
|
||||
max_value = schema["exclusiveMaximum"].get<int>() - 1;
|
||||
}
|
||||
std::stringstream out;
|
||||
out << "(";
|
||||
_build_min_max_int(min_value, max_value, out);
|
||||
out << ") space";
|
||||
return _add_rule(rule_name, out.str());
|
||||
} else if (schema.empty() || schema_type == "object") {
|
||||
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
|
||||
} else {
|
||||
|
@ -975,10 +700,10 @@ public:
|
|||
|
||||
void check_errors() {
|
||||
if (!_errors.empty()) {
|
||||
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
|
||||
throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
|
||||
}
|
||||
if (!_warnings.empty()) {
|
||||
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
|
||||
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -991,35 +716,11 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
|
||||
#ifdef LLAMA_USE_LLGUIDANCE
|
||||
if (!force_gbnf) {
|
||||
return "%llguidance {}\nstart: %json " + schema.dump();
|
||||
}
|
||||
#else
|
||||
(void)force_gbnf;
|
||||
#endif // LLAMA_USE_LLGUIDANCE
|
||||
return build_grammar([&](const common_grammar_builder & callbacks) {
|
||||
std::string json_schema_to_grammar(const json & schema) {
|
||||
SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
|
||||
auto copy = schema;
|
||||
callbacks.resolve_refs(copy);
|
||||
callbacks.add_schema("", copy);
|
||||
});
|
||||
}
|
||||
|
||||
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
|
||||
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
|
||||
common_grammar_builder builder {
|
||||
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
|
||||
return converter._add_rule(name, rule);
|
||||
},
|
||||
/* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
|
||||
return converter.visit(schema, name == "root" ? "" : name);
|
||||
},
|
||||
/* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
|
||||
converter.resolve_refs(schema, "");
|
||||
}
|
||||
};
|
||||
cb(builder);
|
||||
converter.resolve_refs(copy, "input");
|
||||
converter.visit(copy, "");
|
||||
converter.check_errors();
|
||||
return converter.format_grammar();
|
||||
}
|
||||
|
|
|
@ -5,18 +5,4 @@
|
|||
#define JSON_ASSERT GGML_ASSERT
|
||||
#include "json.hpp"
|
||||
|
||||
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
|
||||
bool force_gbnf = false);
|
||||
|
||||
struct common_grammar_builder {
|
||||
std::function<std::string(const std::string &, const std::string &)> add_rule;
|
||||
std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
|
||||
std::function<void(nlohmann::ordered_json &)> resolve_refs;
|
||||
};
|
||||
|
||||
struct common_grammar_options {
|
||||
bool dotall = false;
|
||||
bool compact_spaces = false;
|
||||
};
|
||||
|
||||
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
|
||||
std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue