Merge remote-tracking branch 'origin/master' into tool-call

This commit is contained in: commit 1afa31289d

493 changed files with 63724 additions and 57052 deletions

.clang-format  (new file, 161 lines)
@@ -0,0 +1,161 @@
---
Language: Cpp
AlignAfterOpenBracket: Align
AlignArrayOfStructures: Left
AlignConsecutiveAssignments: AcrossComments
AlignConsecutiveBitFields: AcrossComments
AlignConsecutiveDeclarations: AcrossComments
AlignConsecutiveMacros: AcrossComments
# AlignConsecutiveShortCaseStatements: AcrossComments
AlignEscapedNewlines: Left # LeftWithLastLine
AlignOperands: Align
AlignTrailingComments:
  Kind: Always
  OverEmptyLines: 1
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: false
# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: true
BinPackParameters: true # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
  AfterCaseLabel: true
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  BeforeLambdaBody: false
  BeforeWhile: false
  IndentBraces: false
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never
BreakBeforeBinaryOperators: None
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: false
# BreakBinaryOperations: Never
BreakConstructorInitializers: AfterColon
# BreakFunctionDefinitionParameters: false
BreakInheritanceList: AfterComma
BreakStringLiterals: true
# BreakTemplateDeclarations: Yes
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
EmptyLineBeforeAccessModifier: Leave
EmptyLineAfterAccessModifier: Never
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
  - Regex: '^<.*\.h>'
    Priority: 1
    SortPriority: 0
  - Regex: '^<.*'
    Priority: 2
    SortPriority: 0
  - Regex: '.*'
    Priority: 3
    SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseBlocks: true
IndentCaseLabels: true
IndentExternBlock: NoIndent
IndentGotoLabels: false
IndentPPDirectives: AfterHash
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertBraces: true # NOTE: may lead to incorrect formatting
InsertNewlineAtEOF: true
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
LambdaBodyIndentation: Signature
LineEnding: LF
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PPIndentWidth: -1
PackConstructorInitializers: CurrentLine
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Middle
QualifierAlignment: Left
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
RawStringFormats:
  - Language: Cpp
    Delimiters:
      - cc
      - CC
      - cpp
      - Cpp
      - CPP
      - 'c++'
      - 'C++'
    CanonicalDelimiter: ''
ReferenceAlignment: Middle
ReflowComments: false # IndentOnly
SeparateDefinitionBlocks: Always
SortIncludes: CaseInsensitive
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: true
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInContainerLiterals: true
SpacesInLineCommentPrefix:
  Minimum: 1
  Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: c++17
TabWidth: 4
UseTab: Never
WhitespaceSensitiveMacros: ['STRINGIZE']
...
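For illustration only (not part of the commit): under the settings above, a small, hypothetical C++ helper would be laid out roughly like this, with 4-space indents, attached braces, mid-aligned references, and two spaces before trailing comments.

#include <cstdint>
#include <vector>

// hypothetical helper, shown only to illustrate the formatting rules above
static int32_t sum_values(const std::vector<int32_t> & values) {
    int32_t total = 0;
    for (const int32_t v : values) {
        total += v;  // SpacesBeforeTrailingComments: 2
    }
    return total;
}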
@@ -17,8 +17,10 @@ Checks: >
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
     portability-*,
+    -portability-simd-intrinsics,
     misc-*,
     -misc-const-correctness,
     -misc-non-private-member-variables-in-classes,
     -misc-no-recursion,
+    -misc-use-anonymous-namespace,
 FormatStyle: none
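For context only (this example is not from the commit): the newly suppressed misc-use-anonymous-namespace check would otherwise flag file-local definitions such as the hypothetical static helper below and suggest moving it into an anonymous namespace.

// hypothetical file-local helper; with the check enabled, clang-tidy would
// recommend an anonymous namespace instead of internal linkage via `static`
static float scale_value(float x, float factor) {
    return x * factor;
}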
@@ -26,7 +26,7 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
     export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc) && \
     cp build/bin/* .
 
@@ -6,6 +6,9 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
 RUN apt-get update && \
     apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
 
@@ -19,7 +22,11 @@ WORKDIR /app
 
 COPY . .
 
-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+    export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc) && \
     cp build/bin/* .
 
@@ -3,23 +3,36 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 
 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
-
-COPY requirements.txt requirements.txt
-COPY requirements requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
 
 WORKDIR /app
 
 COPY . .
 
-ENV LLAMA_CURL=1
+RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build build -j $(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib/ \;
 
-RUN make -j$(nproc)
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+WORKDIR /app
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt /app/requirements.txt
+COPY requirements /app/requirements
+COPY .devops/tools.sh /app/tools.sh
+
+RUN pip install --upgrade pip setuptools wheel && \
+    pip install -r /app/requirements.txt
+
+COPY --from=build /app/build/bin/ /app/
+COPY --from=build /app/lib/ /app/
+COPY --from=build /app/convert_hf_to_gguf.py /app/
+COPY --from=build /app/gguf-py /app/gguf-py
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT ["/app/.devops/tools.sh"]
+ENTRYPOINT ["/app/tools.sh"]
@@ -1,6 +1,6 @@
 ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
 
-FROM cosdt/cann:$ASCEND_VERSION AS build
+FROM ascendai/cann:$ASCEND_VERSION AS build
 
 WORKDIR /app
 
@@ -22,11 +22,11 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 
 RUN echo "Building with static libs" && \
     source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
-    cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
     cmake --build build --config Release --target llama-cli
 
 # TODO: use image with NNRT
-FROM cosdt/cann:$ASCEND_VERSION AS runtime
+FROM ascendai/cann:$ASCEND_VERSION AS runtime
 COPY --from=build /app/build/bin/llama-cli /llama-cli
 
 ENV LC_ALL=C.utf8
@@ -22,16 +22,17 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
     export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-cli -j$(nproc)
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
 
 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 
 RUN apt-get update && \
     apt-get install -y libgomp1
 
-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/lib/ /
+COPY --from=build /app/build/bin/llama-cli /
 
 ENTRYPOINT [ "/llama-cli" ]
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
 
@@ -15,7 +15,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
     export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
     echo "Building with static libs" && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
     ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
     cmake --build build --config Release --target llama-cli
 
@@ -8,6 +8,9 @@ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
 RUN apt-get update && \
     apt-get install -y build-essential git cmake
 
@@ -15,16 +18,21 @@ WORKDIR /app
 
 COPY . .
 
-RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-cli -j$(nproc)
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+    export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
 
 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
 
 RUN apt-get update && \
     apt-get install -y libgomp1
 
-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/lib/ /
 COPY --from=build /app/build/bin/llama-cli /llama-cli
 
 ENTRYPOINT [ "/llama-cli" ]
@@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \
     cmake --build build --config Release --target llama-cli
 
 # Clean up
@@ -3,21 +3,27 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 
 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
 
 WORKDIR /app
 
 COPY . .
 
-RUN make -j$(nproc) llama-cli
+RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build build -j $(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib/ \;
 
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 
-RUN apt-get update && \
-    apt-get install -y libgomp1
+WORKDIR /app
 
-COPY --from=build /app/llama-cli /llama-cli
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/build/bin/llama-cli /app/
+COPY --from=build /app/lib/ /app/
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/app/llama-cli" ]
@@ -22,16 +22,17 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
     export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-server -j$(nproc)
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
 
 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/lib/ /
 COPY --from=build /app/build/bin/llama-server /llama-server
 
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
 
@@ -15,7 +15,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
     export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
     echo "Building with dynamic libs" && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
     cmake --build build --config Release --target llama-server
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
@@ -8,6 +8,9 @@ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
 RUN apt-get update && \
     apt-get install -y build-essential git cmake libcurl4-openssl-dev
 
@@ -15,16 +18,21 @@ WORKDIR /app
 
 COPY . .
 
-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-server -j$(nproc)
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+    export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
 
 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
 
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/lib/ /
 COPY --from=build /app/build/bin/llama-server /llama-server
 
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
@@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
     cmake --build build --config Release --target llama-server
 
 # Clean up
@@ -3,22 +3,26 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 
 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
 
 WORKDIR /app
 
 COPY . .
 
-ENV LLAMA_CURL=1
-
-RUN make -j$(nproc) llama-server
+RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build build -j $(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib/ \;
 
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 
+WORKDIR /app
+
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
-COPY --from=build /app/llama-server /llama-server
+COPY --from=build /app/build/bin/llama-server /app/
+COPY --from=build /app/lib/ /app/
 
 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
@@ -26,4 +30,4 @@ ENV LLAMA_ARG_HOST=0.0.0.0
 
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 
-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/app/llama-server" ]
@@ -126,9 +126,9 @@ effectiveStdenv.mkDerivation (finalAttrs: {
   };
 
   postPatch = ''
-    substituteInPlace ./ggml/src/ggml-metal.m \
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
       --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-    substituteInPlace ./ggml/src/ggml-metal.m \
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
       --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
   '';
 
@@ -173,7 +173,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
     (cmakeBool "GGML_NATIVE" false)
     (cmakeBool "GGML_BLAS" useBlas)
     (cmakeBool "GGML_CUDA" useCuda)
-    (cmakeBool "GGML_HIPBLAS" useRocm)
+    (cmakeBool "GGML_HIP" useRocm)
     (cmakeBool "GGML_METAL" useMetalKit)
     (cmakeBool "GGML_VULKAN" useVulkan)
     (cmakeBool "GGML_STATIC" enableStatic)
@@ -34,7 +34,7 @@ let
 
       # server tests
      openai
-      behave
+      pytest
      prometheus-client
    ];
 in
@@ -24,6 +24,16 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2
 
+[examples/server/public/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
+[examples/server/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
 
.github/ISSUE_TEMPLATE/01-bug-low.yml  (deleted file, 50 lines)
@@ -1,50 +0,0 @@
name: Low Severity Bugs
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
title: "Bug: "
labels: ["bug-unconfirmed", "low severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
.github/ISSUE_TEMPLATE/010-bug-compilation.yml  (new file, 77 lines)
@@ -0,0 +1,77 @@
name: Bug (compilation)
description: Something goes wrong when trying to compile llama.cpp.
title: "Compile bug: "
labels: ["bug-unconfirmed", "compilation"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for bug reports where the compilation of llama.cpp fails.
        Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
        by clearing `~/.cache/ccache` (on Linux).
  - type: textarea
    id: commit
    attributes:
      label: Git commit
      description: Which commit are you trying to compile?
      placeholder: |
        $git rev-parse HEAD
        84a07a17b1b08cf2b9747c633a2372782848a27f
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: true
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
      multiple: true
    validations:
      required: true
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
      placeholder: >
        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
        Here are the exact commands that I used: ...
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        Please copy and paste any relevant log output, including the command that you entered and any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
.github/ISSUE_TEMPLATE/011-bug-results.yml  (new file, 101 lines)
@@ -0,0 +1,101 @@
name: Bug (model use)
description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
title: "Eval bug: "
labels: ["bug-unconfirmed", "model evaluation"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for bug reports where the model evaluation results
        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
        The `llama-cli` binary can be used for simple and reproducible model inference.
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: true
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
      multiple: true
    validations:
      required: true
  - type: textarea
    id: hardware
    attributes:
      label: Hardware
      description: Which CPUs/GPUs are you using?
      placeholder: >
        e.g. Ryzen 5950X + 2x RTX 4090
    validations:
      required: true
  - type: textarea
    id: model
    attributes:
      label: Models
      description: >
        Which model(s) at which quantization were you using when encountering the bug?
        If you downloaded a GGUF file off of Huggingface, please provide a link.
      placeholder: >
        e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
    validations:
      required: false
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
        that information would be very much appreciated by us.
      placeholder: >
        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
        When I use -ngl 0 it works correctly.
        Here are the exact commands that I used: ...
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        Please copy and paste any relevant log output, including the command that you entered and any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
.github/ISSUE_TEMPLATE/019-bug-misc.yml  (new file, 81 lines)
@@ -0,0 +1,81 @@
name: Bug (misc.)
description: Something is not working the way it should (and it's not covered by any of the above cases).
title: "Misc. bug: "
labels: ["bug-unconfirmed"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for miscellaneous bugs that don't fit into any other category.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which version of our software is affected? (You can use `--version` to get a version string.)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: dropdown
    id: module
    attributes:
      label: Which llama.cpp modules do you know to be affected?
      multiple: true
      options:
        - Documentation/Github
        - libllama (core library)
        - llama-cli
        - llama-server
        - llama-bench
        - llama-quantize
        - Python/Bash scripts
        - Test code
        - Other (Please specify in the next section)
    validations:
      required: false
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: false
.github/ISSUE_TEMPLATE/02-bug-medium.yml  (deleted file, 50 lines)
@@ -1,50 +0,0 @@
name: Medium Severity Bug
description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
title: "Bug: "
labels: ["bug-unconfirmed", "medium severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
@@ -1,5 +1,5 @@
 name: Enhancement
-description: Used to request enhancements for llama.cpp
+description: Used to request enhancements for llama.cpp.
 title: "Feature Request: "
 labels: ["enhancement"]
 body:
.github/ISSUE_TEMPLATE/03-bug-high.yml  (deleted file, 50 lines)
@@ -1,50 +0,0 @@
name: High Severity Bug
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
title: "Bug: "
labels: ["bug-unconfirmed", "high severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
@@ -1,5 +1,5 @@
 name: Research
-description: Track new technical research area
+description: Track new technical research area.
 title: "Research: "
 labels: ["research 🔬"]
 body:
.github/ISSUE_TEMPLATE/04-bug-critical.yml  (deleted file, 50 lines)
@@ -1,50 +0,0 @@
name: Critical Severity Bug
description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
title: "Bug: "
labels: ["bug-unconfirmed", "critical severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
@@ -1,5 +1,5 @@
 name: Refactor (Maintainers)
-description: Used to track refactoring opportunities
+description: Used to track refactoring opportunities.
 title: "Refactor: "
 labels: ["refactor"]
 body:
.github/labeler.yml
vendored
15
.github/labeler.yml
vendored
|
@ -3,19 +3,18 @@ Kompute:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml/include/ggml-kompute.h
|
- ggml/include/ggml-kompute.h
|
||||||
- ggml/src/ggml-kompute.cpp
|
- ggml/src/ggml-kompute/**
|
||||||
- README-kompute.md
|
- README-kompute.md
|
||||||
Apple Metal:
|
Apple Metal:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml/include/ggml-metal.h
|
- ggml/include/ggml-metal.h
|
||||||
- ggml/src/ggml-metal.cpp
|
- ggml/src/ggml-metal/**
|
||||||
- README-metal.md
|
- README-metal.md
|
||||||
SYCL:
|
SYCL:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml/include/ggml-sycl.h
|
- ggml/include/ggml-sycl.h
|
||||||
- ggml/src/ggml-sycl.cpp
|
|
||||||
- ggml/src/ggml-sycl/**
|
- ggml/src/ggml-sycl/**
|
||||||
- docs/backend/SYCL.md
|
- docs/backend/SYCL.md
|
||||||
- examples/sycl/**
|
- examples/sycl/**
|
||||||
|
@ -27,8 +26,8 @@ Nvidia GPU:
|
||||||
Vulkan:
|
Vulkan:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml/ggml_vk_generate_shaders.py
|
- ggml/include/ggml-vulkan.h
|
||||||
- ggml/src/ggml-vulkan*
|
- ggml/src/ggml-vulkan/**
|
||||||
documentation:
|
documentation:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
|
@ -75,11 +74,7 @@ server:
|
||||||
ggml:
|
ggml:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml/include/ggml*.h
|
- ggml/**
|
||||||
- ggml/src/ggml*.c
|
|
||||||
- ggml/src/ggml*.cpp
|
|
||||||
- ggml/src/ggml*.h
|
|
||||||
- ggml-cuda/**
|
|
||||||
nix:
|
nix:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
|
|
.github/pull_request_template.md  (8 changed lines)
@@ -1,7 +1 @@
-- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
-
-- Self-reported review complexity:
-  - [ ] Low
-  - [ ] Medium
-  - [ ] High
+*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
.github/workflows/build.yml  (363 changed lines)
@@ -55,7 +55,13 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
+          cmake .. \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DGGML_RPC=ON \
+            -DBUILD_SHARED_LIBS=OFF
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -92,7 +98,7 @@ jobs:
           name: llama-bin-macos-arm64.zip
 
   macOS-latest-cmake-x64:
-    runs-on: macos-12
+    runs-on: macos-13
 
     steps:
       - name: Clone
@@ -113,7 +119,12 @@ jobs:
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_METAL=OFF \
+            -DGGML_RPC=ON \
+            -DBUILD_SHARED_LIBS=OFF
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -149,66 +160,6 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
           name: llama-bin-macos-x64.zip
 
-  ubuntu-focal-make:
-    runs-on: ubuntu-20.04
-    env:
-      LLAMA_NODE_AVAILABLE: true
-      LLAMA_PYTHON_AVAILABLE: true
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential gcc-8
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: "20"
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
-      - name: Build
-        id: make_build
-        env:
-          LLAMA_FATAL_WARNINGS: 1
-        run: |
-          CC=gcc-8 make -j $(nproc)
-
-      - name: Test
-        id: make_test
-        run: |
-          CC=gcc-8 make tests -j $(nproc)
-          make test -j $(nproc)
-
-  ubuntu-focal-make-curl:
-    runs-on: ubuntu-20.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev
-
-      - name: Build
-        id: make_build
-        env:
-          LLAMA_FATAL_WARNINGS: 1
-          LLAMA_CURL: 1
-        run: |
-          CC=gcc-8 make -j $(nproc)
-
   ubuntu-latest-cmake:
     runs-on: ubuntu-latest
 
@@ -394,15 +345,36 @@ jobs:
       - name: Build with native CMake HIP support
         id: cmake_build
         run: |
-          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
+          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
           cmake --build build --config Release -j $(nproc)
 
       - name: Build with legacy HIP support
         id: cmake_build_legacy_hip
         run: |
-          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
+          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
           cmake --build build2 --config Release -j $(nproc)
 
+  ubuntu-22-cmake-musa:
+    runs-on: ubuntu-22.04
+    container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          apt-get update
+          apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+      - name: Build with native CMake MUSA support
+        id: cmake_build
+        run: |
+          cmake -B build -S . -DGGML_MUSA=ON
+          cmake --build build --config Release -j $(nproc)
+
   ubuntu-22-cmake-sycl:
     runs-on: ubuntu-22.04
 
@@ -485,36 +457,6 @@ jobs:
           cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
           cmake --build . --config Release -j $(nproc)
 
-  # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  # how to debug it.
-  # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
-  macOS-latest-make:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: make_build
-        env:
-          LLAMA_FATAL_WARNINGS: 1
-        run: |
-          GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: make_test
-        run: |
-          GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
-          GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
-
   # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
   # how to debug it.
   # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
@@ -569,6 +511,7 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
+            -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
@@ -599,6 +542,7 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
+            -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
@@ -608,33 +552,35 @@ jobs:
             -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
-  macOS-latest-swift:
-    runs-on: macos-latest
-
-    strategy:
+  # TODO: tmp disabled. see for possible re-enable:
+  # https://github.com/ggerganov/llama.cpp/pull/10525
+  # macOS-latest-swift:
+  #   runs-on: macos-latest
|
||||||
matrix:
|
#
|
||||||
destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
|
# strategy:
|
||||||
|
# matrix:
|
||||||
steps:
|
# destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
|
||||||
- name: Clone
|
#
|
||||||
id: checkout
|
# steps:
|
||||||
uses: actions/checkout@v4
|
# - name: Clone
|
||||||
|
# id: checkout
|
||||||
- name: Dependencies
|
# uses: actions/checkout@v4
|
||||||
id: depends
|
#
|
||||||
continue-on-error: true
|
# - name: Dependencies
|
||||||
run: |
|
# id: depends
|
||||||
brew update
|
# continue-on-error: true
|
||||||
|
# run: |
|
||||||
- name: xcodebuild for swift package
|
# brew update
|
||||||
id: xcodebuild
|
#
|
||||||
run: |
|
# - name: xcodebuild for swift package
|
||||||
xcodebuild -scheme llama -destination "${{ matrix.destination }}"
|
# id: xcodebuild
|
||||||
|
# run: |
|
||||||
- name: Build Swift Example
|
# xcodebuild -scheme llama -destination "${{ matrix.destination }}"
|
||||||
id: make_build_swift_example
|
#
|
||||||
run: |
|
# - name: Build Swift Example
|
||||||
make swift
|
# id: make_build_swift_example
|
||||||
|
# run: |
|
||||||
|
# make swift
|
||||||
|
|
||||||
windows-msys2:
|
windows-msys2:
|
||||||
runs-on: windows-latest
|
runs-on: windows-latest
|
||||||
|
@ -661,21 +607,6 @@ jobs:
|
||||||
mingw-w64-${{matrix.env}}-cmake
|
mingw-w64-${{matrix.env}}-cmake
|
||||||
mingw-w64-${{matrix.env}}-openblas
|
mingw-w64-${{matrix.env}}-openblas
|
||||||
|
|
||||||
- name: Build using make
|
|
||||||
shell: msys2 {0}
|
|
||||||
run: |
|
|
||||||
make -j $(nproc)
|
|
||||||
|
|
||||||
- name: Clean after building using make
|
|
||||||
shell: msys2 {0}
|
|
||||||
run: |
|
|
||||||
make clean
|
|
||||||
|
|
||||||
- name: Build using make w/ OpenBLAS
|
|
||||||
shell: msys2 {0}
|
|
||||||
run: |
|
|
||||||
make GGML_OPENBLAS=1 -j $(nproc)
|
|
||||||
|
|
||||||
- name: Build using CMake
|
- name: Build using CMake
|
||||||
shell: msys2 {0}
|
shell: msys2 {0}
|
||||||
run: |
|
run: |
|
||||||
|
@ -694,7 +625,7 @@ jobs:
|
||||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||||
|
|
||||||
windows-latest-cmake:
|
windows-latest-cmake:
|
||||||
runs-on: windows-2019
|
runs-on: windows-latest
|
||||||
|
|
||||||
env:
|
env:
|
||||||
OPENBLAS_VERSION: 0.3.23
|
OPENBLAS_VERSION: 0.3.23
|
||||||
|
@ -734,7 +665,7 @@ jobs:
|
||||||
id: clone_kompute
|
id: clone_kompute
|
||||||
if: ${{ matrix.build == 'kompute-x64' }}
|
if: ${{ matrix.build == 'kompute-x64' }}
|
||||||
run: |
|
run: |
|
||||||
git submodule update --init ggml/src/kompute
|
git submodule update --init ggml/src/ggml-kompute/kompute
|
||||||
|
|
||||||
- name: Download OpenBLAS
|
- name: Download OpenBLAS
|
||||||
id: get_openblas
|
id: get_openblas
|
||||||
|
@ -837,12 +768,33 @@ jobs:
|
||||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||||
name: llama-bin-win-${{ matrix.build }}.zip
|
name: llama-bin-win-${{ matrix.build }}.zip
|
||||||
|
|
||||||
windows-latest-cmake-cuda:
|
ubuntu-latest-cmake-cuda:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
container: nvidia/cuda:12.6.2-devel-ubuntu24.04
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
env:
|
||||||
|
DEBIAN_FRONTEND: noninteractive
|
||||||
|
run: |
|
||||||
|
apt update
|
||||||
|
apt install -y cmake build-essential ninja-build libgomp1 git
|
||||||
|
|
||||||
|
- name: Build with CMake
|
||||||
|
run: |
|
||||||
|
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
|
||||||
|
cmake --build build
|
||||||
|
|
||||||
|
windows-2019-cmake-cuda:
|
||||||
runs-on: windows-2019
|
runs-on: windows-2019
|
||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
cuda: ['12.2.0', '11.7.1']
|
cuda: ['12.4', '11.7']
|
||||||
build: ['cuda']
|
build: ['cuda']
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
@ -852,22 +804,81 @@ jobs:
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: Install CUDA toolkit
|
- name: Install Cuda Toolkit 11.7
|
||||||
id: cuda-toolkit
|
if: ${{ matrix.cuda == '11.7' }}
|
||||||
uses: Jimver/cuda-toolkit@v0.2.15
|
run: |
|
||||||
|
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||||
|
choco install unzip -y
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
|
||||||
|
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||||
|
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||||
|
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||||
|
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||||
|
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||||
|
|
||||||
|
- name: Install Cuda Toolkit 12.4
|
||||||
|
if: ${{ matrix.cuda == '12.4' }}
|
||||||
|
run: |
|
||||||
|
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||||
|
choco install unzip -y
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
|
||||||
|
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
|
||||||
|
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||||
|
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||||
|
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||||
|
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||||
|
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||||
|
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||||
|
|
||||||
|
- name: Install ccache
|
||||||
|
uses: hendrikmuhs/ccache-action@v1.2
|
||||||
with:
|
with:
|
||||||
cuda: ${{ matrix.cuda }}
|
key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
|
||||||
method: 'network'
|
|
||||||
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
|
- name: Install Ninja
|
||||||
|
id: install_ninja
|
||||||
|
run: |
|
||||||
|
choco install ninja
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
|
shell: cmd
|
||||||
run: |
|
run: |
|
||||||
mkdir build
|
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||||
cd build
|
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
|
||||||
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
|
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||||
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
|
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
cmake --build build --config Release
|
||||||
|
|
||||||
- name: Determine tag name
|
- name: Determine tag name
|
||||||
id: tag
|
id: tag
|
||||||
|
@ -896,10 +907,12 @@ jobs:
|
||||||
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||||
|
|
||||||
- name: Copy and pack Cuda runtime
|
- name: Copy and pack Cuda runtime
|
||||||
|
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||||
run: |
|
run: |
|
||||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
echo "Cuda install location: ${{ env.CUDA_PATH }}"
|
||||||
$dst='.\build\bin\cudart\'
|
$dst='.\build\bin\cudart\'
|
||||||
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||||
|
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
|
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
|
||||||
|
|
||||||
- name: Upload Cuda runtime
|
- name: Upload Cuda runtime
|
||||||
|
@ -917,8 +930,8 @@ jobs:
|
||||||
shell: bash
|
shell: bash
|
||||||
|
|
||||||
env:
|
env:
|
||||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
|
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
|
||||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
|
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
|
@ -928,7 +941,8 @@ jobs:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: Install
|
- name: Install
|
||||||
run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
run: |
|
||||||
|
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
|
@ -947,25 +961,33 @@ jobs:
|
||||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||||
fi
|
fi
|
||||||
|
|
||||||
- name: Pack artifacts
|
- name: Build the release package
|
||||||
id: pack_artifacts
|
id: pack_artifacts
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
run: |
|
run: |
|
||||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
|
|
||||||
|
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||||
|
|
||||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
|
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
|
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
|
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||||
|
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||||
|
|
||||||
|
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||||
|
|
||||||
|
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||||
|
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||||
|
|
||||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||||
|
|
||||||
- name: Upload artifacts
|
- name: Upload the release package
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
|
@ -996,12 +1018,17 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
||||||
|
|
||||||
|
- name: Install ccache
|
||||||
|
uses: hendrikmuhs/ccache-action@v1.2
|
||||||
|
with:
|
||||||
|
key: ${{ github.job }}
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||||
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
|
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
|
||||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||||
|
|
||||||
windows-latest-cmake-hip-release:
|
windows-latest-cmake-hip-release:
|
||||||
|
@ -1016,6 +1043,8 @@ jobs:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
id: checkout
|
id: checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: Install
|
- name: Install
|
||||||
id: depends
|
id: depends
|
||||||
|
@ -1037,7 +1066,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||||
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
|
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
|
||||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||||
md "build\bin\rocblas\library\"
|
md "build\bin\rocblas\library\"
|
||||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
||||||
|
@ -1125,12 +1154,10 @@ jobs:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
needs:
|
needs:
|
||||||
- ubuntu-focal-make
|
|
||||||
- ubuntu-latest-cmake
|
- ubuntu-latest-cmake
|
||||||
- macOS-latest-make
|
|
||||||
- macOS-latest-cmake
|
- macOS-latest-cmake
|
||||||
- windows-latest-cmake
|
- windows-latest-cmake
|
||||||
- windows-latest-cmake-cuda
|
- windows-2019-cmake-cuda
|
||||||
- windows-latest-cmake-hip-release
|
- windows-latest-cmake-hip-release
|
||||||
- macOS-latest-cmake-arm64
|
- macOS-latest-cmake-arm64
|
||||||
- macOS-latest-cmake-x64
|
- macOS-latest-cmake-x64
|
||||||
13  .github/workflows/docker.yml  vendored
@@ -10,12 +10,10 @@
 name: Publish Docker image

 on:
-  #pull_request:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-  workflow_dispatch: # allows manual triggering, useful for debugging
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because it is expensive
+    - cron: '12 4 * * *'

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -29,7 +27,6 @@ permissions:
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    #if: github.event.pull_request.draft == false

     runs-on: ubuntu-latest
     env:
@@ -117,7 +114,7 @@ jobs:
           swap-storage: true

       - name: Build and push Docker image (tagged + versioned)
-        if: github.event_name == 'push'
+        if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
         uses: docker/build-push-action@v6
         with:
           context: .
72  .github/workflows/nix-ci-aarch64.yml  vendored
@@ -1,72 +0,0 @@
-name: Nix aarch64 builds
-
-on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
-    # 1.5h instead of minutes with the cold cache).
-    #
-    # randint(0, 59), randint(0, 23)
-    - cron: '26 12 * * *'
-  # But also rebuild if we touched any of the Nix expressions:
-  push:
-    branches:
-      - master
-    paths: ['**/*.nix', 'flake.lock']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['**/*.nix', 'flake.lock']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
-  id-token: write
-  contents: read
-
-jobs:
-  nix-build-aarch64:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Install QEMU
-        # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y qemu-user-static qemu-system-aarch64
-          sudo usermod -a -G kvm $USER
-      - name: Install Nix
-        uses: DeterminateSystems/nix-installer-action@v9
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          extra-conf: |
-            extra-platforms = aarch64-linux
-            extra-system-features = nixos-test kvm
-            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-      - uses: DeterminateSystems/magic-nix-cache-action@v2
-        with:
-          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-      - name: Set-up cachix to push the results to
-        uses: cachix/cachix-action@v13
-        with:
-          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-          name: llama-cpp
-      - name: Show all output paths
-        run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.aarch64-linux"
-      - name: Build
-        run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --systems aarch64-linux
-          --flake
-          ".#checks.aarch64-linux"
79  .github/workflows/nix-ci.yml  vendored
@@ -1,79 +0,0 @@
-name: Nix CI
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-  pull_request:
-    types: [opened, synchronize, reopened]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
-  id-token: write
-  contents: read
-
-jobs:
-  nix-eval:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Install Nix
-        uses: DeterminateSystems/nix-installer-action@v9
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          extra-conf: |
-            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-      - uses: DeterminateSystems/magic-nix-cache-action@v2
-        with:
-          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-      - name: List all flake outputs
-        run: nix flake show --all-systems
-      - name: Show all output paths
-        run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
-  nix-build:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Install Nix
-        uses: DeterminateSystems/nix-installer-action@v9
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          extra-conf: |
-            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-      - uses: DeterminateSystems/magic-nix-cache-action@v2
-        with:
-          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-      - name: Set-up cachix to push the results to
-        uses: cachix/cachix-action@v13
-        with:
-          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-          name: llama-cpp
-      - name: Build
-        run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --flake
-          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
22  .github/workflows/nix-flake-update.yml  vendored
@@ -1,22 +0,0 @@
-name: update-flake-lock
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
-
-jobs:
-  lockfile:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Install Nix
-        uses: DeterminateSystems/nix-installer-action@main
-      - name: Update flake.lock
-        uses: DeterminateSystems/update-flake-lock@main
-        with:
-          pr-title: "nix: update flake.lock"
-          pr-labels: |
-            nix
-          pr-reviewers: philiptaron,SomeoneSerge
-          token: ${{ secrets.FLAKE_TOKEN }}
36  .github/workflows/nix-publish-flake.yml  vendored
@@ -1,36 +0,0 @@
-# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
-name: "Publish a flake to flakestry & flakehub"
-on:
-  push:
-    tags:
-      - "*"
-  workflow_dispatch:
-    inputs:
-      tag:
-        description: "The existing tag to publish"
-        type: "string"
-        required: true
-jobs:
-  flakestry-publish:
-    runs-on: ubuntu-latest
-    permissions:
-      id-token: "write"
-      contents: "read"
-    steps:
-      - uses: flakestry/flakestry-publish@main
-        with:
-          version: "${{ inputs.tag || github.ref_name }}"
-  flakehub-publish:
-    runs-on: "ubuntu-latest"
-    permissions:
-      id-token: "write"
-      contents: "read"
-    steps:
-      - uses: "actions/checkout@v4"
-        with:
-          ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
-      - uses: "DeterminateSystems/nix-installer-action@main"
-      - uses: "DeterminateSystems/flakehub-push@main"
-        with:
-          visibility: "public"
-          tag: "${{ inputs.tag }}"
9  .github/workflows/python-lint.yml  vendored
@@ -1,6 +1,13 @@
 name: flake8 Lint

-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
35  .github/workflows/server.yml  vendored
@@ -76,20 +76,26 @@ jobs:
         run: |
           pip install -r examples/server/tests/requirements.txt

-      - name: Verify server deps
-        id: verify_server_deps
+      # Setup nodejs (to be used for verifying bundled index.html)
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 22
+
+      - name: Verify bundled index.html
+        id: verify_server_index_html
         run: |
           git config --global --add safe.directory $(realpath .)
-          cd examples/server
-          git ls-files --others --modified
+          cd examples/server/webui
           git status
-          ./deps.sh
+          npm ci
+          npm run build
           git status
-          not_ignored_files="$(git ls-files --others --modified)"
-          echo "Modified files: ${not_ignored_files}"
-          if [ -n "${not_ignored_files}" ]; then
-            echo "Repository is dirty or server deps are not built as expected"
-            echo "${not_ignored_files}"
+          modified_files="$(git status -s)"
+          echo "Modified files: ${modified_files}"
+          if [ -n "${modified_files}" ]; then
+            echo "Repository is dirty or server/webui is not built as expected"
+            echo "Hint: You may need to follow Web UI build guide in server/README.md"
+            echo "${modified_files}"
             exit 1
           fi

@@ -122,14 +128,14 @@ jobs:
         id: server_integration_tests
         run: |
           cd examples/server/tests
-          PORT=8888 ./tests.sh
+          ./tests.sh

       - name: Slow tests
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
-          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
+          SLOW_TESTS=1 ./tests.sh


   server-windows:
@@ -180,11 +186,12 @@ jobs:
         run: |
           cd examples/server/tests
           $env:PYTHONIOENCODING = ":replace"
-          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+          pytest -v -x

       - name: Slow tests
         id: server_integration_tests_slow
         if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
-          behave.exe --stop --no-skipped --no-capture --tags slow
+          $env:SLOW_TESTS = "1"
+          pytest -v -x
9  .gitignore  vendored
@@ -3,6 +3,7 @@
 *.a
 *.bat
 *.bin
+*.d
 *.dll
 *.dot
 *.etag
@@ -103,6 +104,10 @@ examples/server/*.mjs.hpp
 !examples/sycl/*.bat
 !examples/sycl/*.sh

+# Server Web UI temporary files
+node_modules
+examples/server/webui/dist
+
 # Python

 /.venv
@@ -133,3 +138,7 @@ poetry.toml

 # Test models for lora adapters
 /lora-tests
+
+# Local scripts
+/run-vim.sh
+/run-chat.sh
2  .gitmodules  vendored
@@ -1,3 +1,3 @@
 [submodule "kompute"]
-  path = ggml/src/kompute
+  path = ggml/src/ggml-kompute/kompute
   url = https://github.com/nomic-ai/kompute.git
186  AUTHORS
@@ -1,4 +1,4 @@
-# date: Wed Jun 26 19:36:34 EEST 2024
+# date: Thu Nov 28 20:46:15 EET 2024
 # this file is auto-generated by scripts/gen-authors.sh

 0cc4m <picard12@live.de>
@@ -7,6 +7,7 @@
 2f38b454 <dxf@protonmail.com>
 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
 44670 <44670@users.noreply.github.com>
+65a <10104049+65a@users.noreply.github.com>
 AN Long <aisk@users.noreply.github.com>
 AT <manyoso@users.noreply.github.com>
 Aarni Koskela <akx@iki.fi>
@@ -19,20 +20,28 @@ Adithya Balaji <adithya.b94@gmail.com>
 AdithyanI <adithyan.i4internet@gmail.com>
 Adrian <smith.adriane@gmail.com>
 Adrian Hesketh <a-h@users.noreply.github.com>
+Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
 Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
 AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
+AidanBeltonS <aidan.belton@codeplay.com>
 Aisuko <urakiny@gmail.com>
+Akarshan Biswas <akarshan.biswas@gmail.com>
 Akarshan Biswas <akarshanbiswas@fedoraproject.org>
+Al Mochkin <14274697+amochkin@users.noreply.github.com>
 Albert Jin <albert.jin@gmail.com>
 Alberto <57916483+albbus-stack@users.noreply.github.com>
+Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
+Alberto Cabrera Pérez <alberto.cabrera@intel.com>
 Alex <awhill19@icloud.com>
 Alex Azarov <alex@azarov.by>
 Alex Azarov <alexander.azarov@mapbox.com>
 Alex Klinkhamer <from.github.com.917@grencez.dev>
 Alex Klinkhamer <git@grencez.dev>
 Alex Nguyen <tiendung@users.noreply.github.com>
+Alex O'Connell <35843486+acon96@users.noreply.github.com>
 Alex Petenchea <alex.petenchea@gmail.com>
 Alex Renda <alexrenda@users.noreply.github.com>
+Alex Tuddenham <61622354+AlexsCode@users.noreply.github.com>
 Alex von Gluck IV <kallisti5@unixzen.com>
 Alexey Parfenov <zxed@alkatrazstudio.net>
 Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
@@ -45,18 +54,25 @@ AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
 Ananta Bastola <anantarajbastola@gmail.com>
 Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
 András Salamon <ott2@users.noreply.github.com>
+Andreas (Andi) Kunar <andreask@msn.com>
 Andrei <abetlen@gmail.com>
 Andrew Canis <andrew.canis@gmail.com>
 Andrew Downing <andrew2085@gmail.com>
 Andrew Duffy <a10y@users.noreply.github.com>
 Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
+Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
+Andy Salerno <andysalerno@gmail.com>
 Andy Tai <andy-tai@users.noreply.github.com>
+Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
+Antonis Makropoulos <benuix@gmail.com>
 Arik Poznanski <arikpoz@users.noreply.github.com>
+Armen Kaleshian <kriation@users.noreply.github.com>
 Artem <guinmoon@gmail.com>
 Artem Zinnatullin <ceo@abstractny.gay>
 Artyom Lebedev <vagran.ast@gmail.com>
 Asbjørn Olling <asbjornolling@gmail.com>
 Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
+Asghar Ghorbani <a-ghorbani@users.noreply.github.com>
 Ashish <1856117+ashishdatta@users.noreply.github.com>
 Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
 Ashraful Islam <ashraful.meche@gmail.com>
@@ -76,12 +92,16 @@ Ben Williams <ben@719ben.com>
 Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
 Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
 Bernat Vadell <hounter.caza@gmail.com>
+Bert Wagner <github@bertwagner.com>
 Bingan <70050083+binganao@users.noreply.github.com>
+Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
 Bodo Graumann <mail@bodograumann.de>
 Bono Lv <lvscar@users.noreply.github.com>
 Borislav Stanimirov <b.stanimirov@abv.bg>
 Branden Butler <bwtbutler@hotmail.com>
+Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
 Brian <mofosyne@gmail.com>
+Brian Cunnie <brian.cunnie@gmail.com>
 Bruce MacDonald <brucewmacdonald@gmail.com>
 Bryan Honof <bryanhonof@gmail.com>
 CJ Pais <cj@cjpais.com>
@@ -90,32 +110,47 @@ Calvin Laurenson <calvin@laurenson.dev>
 Cameron <csteele@steelecameron.com>
 Cameron Kaiser <classilla@users.noreply.github.com>
 Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
+CarryFun <76023481+CarryFun@users.noreply.github.com>
+Carsten Kragelund Jørgensen <carsten@kragelund.me>
+CarterLi999 <664681047@qq.com>
 Casey Primozic <casey@cprimozic.net>
 Casey Primozic <me@ameo.link>
 CausalLM <148736309+CausalLM@users.noreply.github.com>
 Cebtenzzre <cebtenzzre@gmail.com>
 Chad Brewbaker <crb002@gmail.com>
+Changyeon Kim <cyzero.kim@samsung.com>
 Chao Jiang <jc19chaoj@zoho.com>
+Charles Xu <63788048+chaxu01@users.noreply.github.com>
+Charles Xu <charles.xu@arm.com>
+Chen Xi <xi2.chen@intel.com>
+Chen Xi <xixichen08@foxmail.com>
 Cheng Shao <terrorjack@type.dance>
+Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
 Chris Elrod <elrodc@gmail.com>
 Chris Kuehl <ckuehl@ckuehl.me>
 Christian Demsar <christian@github.email.demsar.us>
 Christian Demsar <crasm@git.vczf.us>
 Christian Falch <875252+chrfalch@users.noreply.github.com>
 Christian Kögler <ck3d@gmx.de>
+Christian Köhnenkamp <cvk5@me.com>
 Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
 Clark Saben <76020733+csaben@users.noreply.github.com>
 Clint Herron <hanclinto@gmail.com>
+Conrad Kramer <conrad@conradkramer.com>
 CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
+Csaba Kecskemeti <csaba.kecskemeti@gmail.com>
 Cuong Trinh Manh <nguoithichkhampha@gmail.com>
 DAN™ <dranger003@gmail.com>
 Damian Stewart <d@damianstewart.com>
+Dan Johansson <164997844+eddnjjn@users.noreply.github.com>
+Dan Johansson <dan.johansson@arm.com>
 Dane Madsen <dane_madsen@hotmail.com>
 DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
 Daniel Bevenius <daniel.bevenius@gmail.com>
 Daniel Drake <drake@endlessos.org>
 Daniel Hiltgen <dhiltgen@users.noreply.github.com>
 Daniel Illescas Romero <illescas.daniel@protonmail.com>
+Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
 Daniele <57776841+daniandtheweb@users.noreply.github.com>
 DannyDaemonic <DannyDaemonic@gmail.com>
 Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
@@ -129,19 +164,28 @@ David Pflug <david@pflug.email>
 David Renshaw <dwrenshaw@gmail.com>
 David Sommers <12738+databyte@users.noreply.github.com>
 David Yang <davidyang6us@gmail.com>
+DavidKorczynski <david@adalogics.com>
 Dawid Potocki <github@dawidpotocki.com>
 Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
 Dean <Dean.Sinaean@gmail.com>
 Deins <deinsegle@gmail.com>
+Denis Spasyuk <34203011+dspasyuk@users.noreply.github.com>
+Derrick T. Woolworth <dwoolworth@gmail.com>
 Deven Mistry <31466137+deven367@users.noreply.github.com>
+Dibakar Gope <dibakar.gope@arm.com>
 Didzis Gosko <didzis@users.noreply.github.com>
+Diego Devesa <slarengh@gmail.com>
+Diogo Teles Sant'Anna <diogoteles@google.com>
 Djip007 <djip.perois@free.fr>
 Don Mahurin <dmahurin@users.noreply.github.com>
 DooWoong Lee (David) <manics99@naver.com>
 Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
+Dou Xinpeng <15529241576@163.com>
+Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com>
 Douglas Hanley <thesecretaryofwar@gmail.com>
 Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
 Ebey Abraham <ebey97@gmail.com>
+Echo Nolan <echo@echonolan.net>
 Ed Lee <edilee@mozilla.com>
 Ed Lepedus <ed.lepedus@googlemail.com>
 Eddie-Wang <wangjinheng1120@163.com>
@@ -151,10 +195,13 @@ Elbios <141279586+Elbios@users.noreply.github.com>
 Elton Kola <eltonkola@gmail.com>
 Engininja2 <139037756+Engininja2@users.noreply.github.com>
 Equim <sayaka@ekyu.moe>
+Eric Curtin <ecurtin@redhat.com>
+Eric Curtin <ericcurtin17@gmail.com>
 Eric Sommerlade <es0m@users.noreply.github.com>
 Eric Zhang <34133756+EZForever@users.noreply.github.com>
 Erik Garrison <erik.garrison@gmail.com>
 Erik Scholz <Green-Sky@users.noreply.github.com>
+Esko Toivonen <eskot98@gmail.com>
 Ettore Di Giacinto <mudler@users.noreply.github.com>
 Evan Jones <evan.q.jones@gmail.com>
 Evan Miller <emmiller@gmail.com>
@@ -166,19 +213,26 @@ FK <sozforex@gmail.com>
 Fabian <cmdrf@users.noreply.github.com>
 Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
 Faez Shakil <faez.shakil@gmail.com>
+Faisal Zaghloul <faisal.zaghloul@gmail.com>
+Faisal Zaghloul <quic_fzaghlou@quicinc.com>
+Fan Shupei <dymarkfan@outlook.com>
 FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
+Farbod Bijary <110523279+farbodbj@users.noreply.github.com>
 Fattire <528174+fat-tire@users.noreply.github.com>
 Felix <stenbackfelix@gmail.com>
 Finn Voorhees <finnvoorhees@gmail.com>
 Firat <firatkiral@gmail.com>
+FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
 Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
 Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
 Francisco Melo <43780565+francis2tm@users.noreply.github.com>
 Frank Mai <thxcode0824@gmail.com>
 FrankHB <frankhb1989@gmail.com>
+Frankie Robertson <frankier@users.noreply.github.com>
 Fred Douglas <43351173+fredlas@users.noreply.github.com>
 Frederik Vogel <Schaltfehler@users.noreply.github.com>
 Gabe Goodhart <gabe.l.hart@gmail.com>
+Gabe Goodhart <ghart@us.ibm.com>
 GainLee <perfecter.gen@gmail.com>
 Galunid <karolek1231456@gmail.com>
 Gary Linscott <glinscott@gmail.com>
@@ -187,11 +241,13 @@ Gavin Zhao <gavinzhaojw@protonmail.com>
 Genkagaku.GPT <hlhr202@163.com>
 Georgi Gerganov <ggerganov@gmail.com>
 Gilad S <giladgd@users.noreply.github.com>
+Gilad S. <7817232+giladgd@users.noreply.github.com>
 Giuseppe Scrivano <giuseppe@scrivano.org>
 GiviMAD <GiviMAD@users.noreply.github.com>
 Govlzkoy <gotope@users.noreply.github.com>
 Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
 Guillaume Wenzek <gwenzek@users.noreply.github.com>
+Guoliang Hua <32868157+nbcsm@users.noreply.github.com>
 Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
|
Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
|
||||||
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
|
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
|
||||||
Haggai Nuchi <h.nuchi@gmail.com>
|
Haggai Nuchi <h.nuchi@gmail.com>
|
||||||
|
@ -213,11 +269,14 @@ Hong Bo PENG <penghb@cn.ibm.com>
|
||||||
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
|
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
|
||||||
Howard Su <howard0su@gmail.com>
|
Howard Su <howard0su@gmail.com>
|
||||||
Hua Jiang <allenhjiang@outlook.com>
|
Hua Jiang <allenhjiang@outlook.com>
|
||||||
|
Huang Qi <huangqi3@xiaomi.com>
|
||||||
Huawei Lin <huaweilin.cs@gmail.com>
|
Huawei Lin <huaweilin.cs@gmail.com>
|
||||||
Hugo Roussel <hugo.rous@gmail.com>
|
Hugo Roussel <hugo.rous@gmail.com>
|
||||||
|
Huifeng Ou <79071290+ho2103@users.noreply.github.com>
|
||||||
Ian Bull <irbull@eclipsesource.com>
|
Ian Bull <irbull@eclipsesource.com>
|
||||||
Ian Bull <irbull@gmail.com>
|
Ian Bull <irbull@gmail.com>
|
||||||
Ian Scrivener <github@zilogy.asia>
|
Ian Scrivener <github@zilogy.asia>
|
||||||
|
Icecream95 <the.real.icecream95@gmail.com>
|
||||||
Ido S <ido.pluto@gmail.com>
|
Ido S <ido.pluto@gmail.com>
|
||||||
IgnacioFDM <ignaciofdm@gmail.com>
|
IgnacioFDM <ignaciofdm@gmail.com>
|
||||||
Igor Okulist <okigan@gmail.com>
|
Igor Okulist <okigan@gmail.com>
|
||||||
|
@ -226,11 +285,15 @@ Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
|
||||||
Ionoclast Laboratories <brigham@ionoclast.com>
|
Ionoclast Laboratories <brigham@ionoclast.com>
|
||||||
Isaac McFadyen <isaac@imcf.me>
|
Isaac McFadyen <isaac@imcf.me>
|
||||||
IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
|
IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
|
||||||
|
Ivan <nekotekina@gmail.com>
|
||||||
|
Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
|
||||||
Ivan Komarov <Ivan.Komarov@dfyz.info>
|
Ivan Komarov <Ivan.Komarov@dfyz.info>
|
||||||
Ivan Stepanov <ivanstepanovftw@gmail.com>
|
Ivan Stepanov <ivanstepanovftw@gmail.com>
|
||||||
JH23X <165871467+JH23X@users.noreply.github.com>
|
JH23X <165871467+JH23X@users.noreply.github.com>
|
||||||
|
Jack Mousseau <jack@software.inc>
|
||||||
Jack Mousseau <jmousseau@users.noreply.github.com>
|
Jack Mousseau <jmousseau@users.noreply.github.com>
|
||||||
JackJollimore <130917767+JackJollimore@users.noreply.github.com>
|
JackJollimore <130917767+JackJollimore@users.noreply.github.com>
|
||||||
|
Jaeden Amero <jaeden@patater.com>
|
||||||
Jaemin Son <woalsdnd@gmail.com>
|
Jaemin Son <woalsdnd@gmail.com>
|
||||||
Jag Chadha <jagtesh@gmail.com>
|
Jag Chadha <jagtesh@gmail.com>
|
||||||
Jakub N <jakubniemczyk97@gmail.com>
|
Jakub N <jakubniemczyk97@gmail.com>
|
||||||
|
@ -243,10 +306,14 @@ Jannis Schönleber <joennlae@gmail.com>
|
||||||
Jared Van Bortel <cebtenzzre@gmail.com>
|
Jared Van Bortel <cebtenzzre@gmail.com>
|
||||||
Jared Van Bortel <jared@nomic.ai>
|
Jared Van Bortel <jared@nomic.ai>
|
||||||
Jason McCartney <jmac@theroot.org>
|
Jason McCartney <jmac@theroot.org>
|
||||||
|
Jason Stillerman <jason.t.stillerman@gmail.com>
|
||||||
Jean-Christophe Hoelt <hoelt@fovea.cc>
|
Jean-Christophe Hoelt <hoelt@fovea.cc>
|
||||||
Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
|
Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
|
||||||
Jed Fox <git@jedfox.com>
|
Jed Fox <git@jedfox.com>
|
||||||
|
Jeff Bolz <jbolz@nvidia.com>
|
||||||
|
Jeffrey Morgan <jmorganca@gmail.com>
|
||||||
Jeffrey Quesnelle <emozilla@nousresearch.com>
|
Jeffrey Quesnelle <emozilla@nousresearch.com>
|
||||||
|
Jeroen Mostert <jeroen.mostert@cm.com>
|
||||||
Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
|
Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
|
||||||
Jeximo <jeximo@gmail.com>
|
Jeximo <jeximo@gmail.com>
|
||||||
Jhen-Jie Hong <iainst0409@gmail.com>
|
Jhen-Jie Hong <iainst0409@gmail.com>
|
||||||
|
@ -258,6 +325,9 @@ Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
|
||||||
Jiří Sejkora <Sejseloid@gmail.com>
|
Jiří Sejkora <Sejseloid@gmail.com>
|
||||||
Joan Fontanals <jfontanalsmartinez@gmail.com>
|
Joan Fontanals <jfontanalsmartinez@gmail.com>
|
||||||
Joan Fontanals <joan.fontanals.martinez@jina.ai>
|
Joan Fontanals <joan.fontanals.martinez@jina.ai>
|
||||||
|
João Dinis Ferreira <hello@joaof.eu>
|
||||||
|
Joe Eli McIlvain <joe.eli.mac@gmail.com>
|
||||||
|
Joe Todd <joe.todd@codeplay.com>
|
||||||
Johan <JohanAR@users.noreply.github.com>
|
Johan <JohanAR@users.noreply.github.com>
|
||||||
Johannes Gäßler <johannesg@5d6.de>
|
Johannes Gäßler <johannesg@5d6.de>
|
||||||
Johannes Rudolph <johannes.rudolph@gmail.com>
|
Johannes Rudolph <johannes.rudolph@gmail.com>
|
||||||
|
@ -274,7 +344,9 @@ Joyce <joycebrum@google.com>
|
||||||
Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
|
Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
|
||||||
Judd <foldl@users.noreply.github.com>
|
Judd <foldl@users.noreply.github.com>
|
||||||
Julius Arkenberg <arki05@users.noreply.github.com>
|
Julius Arkenberg <arki05@users.noreply.github.com>
|
||||||
|
Jun Hee Yoo <contact.jhyoo@gmail.com>
|
||||||
Jun Jie <71215065+junnjiee16@users.noreply.github.com>
|
Jun Jie <71215065+junnjiee16@users.noreply.github.com>
|
||||||
|
Junil Kim <logyourself@gmail.com>
|
||||||
Junyang Lin <justinlin930319@hotmail.com>
|
Junyang Lin <justinlin930319@hotmail.com>
|
||||||
Juraj Bednar <juraj@bednar.io>
|
Juraj Bednar <juraj@bednar.io>
|
||||||
Justin Parker <jparkerweb@gmail.com>
|
Justin Parker <jparkerweb@gmail.com>
|
||||||
|
@ -292,12 +364,14 @@ Karthik Sethuraman <k.seth1993@gmail.com>
|
||||||
Kasumi <90275229+kasumi-1@users.noreply.github.com>
|
Kasumi <90275229+kasumi-1@users.noreply.github.com>
|
||||||
Kawrakow <48489457+ikawrakow@users.noreply.github.com>
|
Kawrakow <48489457+ikawrakow@users.noreply.github.com>
|
||||||
Keiichi Tabata <keiichi.tabata@outlook.com>
|
Keiichi Tabata <keiichi.tabata@outlook.com>
|
||||||
|
Keke Han <hankeke303@163.com>
|
||||||
Kenvix ⭐ <kenvixzure@live.com>
|
Kenvix ⭐ <kenvixzure@live.com>
|
||||||
Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
|
Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
|
||||||
Kevin Gibbons <bakkot@gmail.com>
|
Kevin Gibbons <bakkot@gmail.com>
|
||||||
Kevin Ji <1146876+kevinji@users.noreply.github.com>
|
Kevin Ji <1146876+kevinji@users.noreply.github.com>
|
||||||
Kevin Kwok <antimatter15@gmail.com>
|
Kevin Kwok <antimatter15@gmail.com>
|
||||||
Kevin Lo <kevlo@kevlo.org>
|
Kevin Lo <kevlo@kevlo.org>
|
||||||
|
Kevin Wang <kevmo314@gmail.com>
|
||||||
Kolen Cheung <ickc@users.noreply.github.com>
|
Kolen Cheung <ickc@users.noreply.github.com>
|
||||||
Konstantin Herud <konstantin.herud@denkbares.com>
|
Konstantin Herud <konstantin.herud@denkbares.com>
|
||||||
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
|
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
|
||||||
|
@ -315,22 +389,29 @@ LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
|
||||||
Leonardo Neumann <leonardo@neumann.dev.br>
|
Leonardo Neumann <leonardo@neumann.dev.br>
|
||||||
Li Tan <tanliboy@gmail.com>
|
Li Tan <tanliboy@gmail.com>
|
||||||
Linwei Wang <wanix1988@gmail.com>
|
Linwei Wang <wanix1988@gmail.com>
|
||||||
|
Liu Jia <109258120+Septa2112@users.noreply.github.com>
|
||||||
|
Liu Jia <jia3.liu@intel.com>
|
||||||
LoganDark <github@logandark.mozmail.com>
|
LoganDark <github@logandark.mozmail.com>
|
||||||
|
Loïc Carrère <loic.carrere@gmail.com>
|
||||||
LostRuins <39025047+LostRuins@users.noreply.github.com>
|
LostRuins <39025047+LostRuins@users.noreply.github.com>
|
||||||
Luciano <lucianostrika44@gmail.com>
|
Luciano <lucianostrika44@gmail.com>
|
||||||
Luo Tian <lt@basecity.com>
|
Luo Tian <lt@basecity.com>
|
||||||
Lyle Dean <dean@lyle.dev>
|
Lyle Dean <dean@lyle.dev>
|
||||||
|
M-A <maruel@gmail.com>
|
||||||
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
|
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
|
||||||
|
Ma Mingfei <mingfei.ma@intel.com>
|
||||||
Maarten ter Huurne <maarten@treewalker.org>
|
Maarten ter Huurne <maarten@treewalker.org>
|
||||||
Mack Straight <eiz@users.noreply.github.com>
|
Mack Straight <eiz@users.noreply.github.com>
|
||||||
Maël Kerbiriou <m431.kerbiriou@gmail.com>
|
Maël Kerbiriou <m431.kerbiriou@gmail.com>
|
||||||
MaggotHATE <clay1326@gmail.com>
|
MaggotHATE <clay1326@gmail.com>
|
||||||
|
Mahesh Madhav <67384846+heshpdx@users.noreply.github.com>
|
||||||
Manuel <44313466+makuche@users.noreply.github.com>
|
Manuel <44313466+makuche@users.noreply.github.com>
|
||||||
Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
|
Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
|
||||||
Marco Matthies <71844+marcom@users.noreply.github.com>
|
Marco Matthies <71844+marcom@users.noreply.github.com>
|
||||||
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
|
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
|
||||||
Marian Cepok <marian.cepok@gmail.com>
|
Marian Cepok <marian.cepok@gmail.com>
|
||||||
Mark Fairbairn <thebaron88@gmail.com>
|
Mark Fairbairn <thebaron88@gmail.com>
|
||||||
|
Mark Zhuang <zhuangqiubin@gmail.com>
|
||||||
Marko Tasic <mtasic85@gmail.com>
|
Marko Tasic <mtasic85@gmail.com>
|
||||||
Markus Tavenrath <mtavenrath@users.noreply.github.com>
|
Markus Tavenrath <mtavenrath@users.noreply.github.com>
|
||||||
Martin Delille <martin@delille.org>
|
Martin Delille <martin@delille.org>
|
||||||
|
@ -342,11 +423,15 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
|
||||||
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
|
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
|
||||||
Matheus C. França <matheus-catarino@hotmail.com>
|
Matheus C. França <matheus-catarino@hotmail.com>
|
||||||
Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
|
Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
|
||||||
|
Mathieu Geli <mathieu.geli@gmail.com>
|
||||||
Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
|
Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
|
||||||
|
Mathijs Henquet <mathijs.henquet@gmail.com>
|
||||||
Mathijs de Bruin <mathijs@mathijsfietst.nl>
|
Mathijs de Bruin <mathijs@mathijsfietst.nl>
|
||||||
Matt Clayton <156335168+mattjcly@users.noreply.github.com>
|
Matt Clayton <156335168+mattjcly@users.noreply.github.com>
|
||||||
Matt Pulver <matt.pulver@heavy.ai>
|
Matt Pulver <matt.pulver@heavy.ai>
|
||||||
|
Matt Stephenson <mstephenson6@users.noreply.github.com>
|
||||||
Matteo Boschini <12133566+mbosc@users.noreply.github.com>
|
Matteo Boschini <12133566+mbosc@users.noreply.github.com>
|
||||||
|
Matteo Mortari <matteo.mortari@gmail.com>
|
||||||
Mattheus Chediak <shammcity00@gmail.com>
|
Mattheus Chediak <shammcity00@gmail.com>
|
||||||
Matthew Tejo <matthew.tejo@gmail.com>
|
Matthew Tejo <matthew.tejo@gmail.com>
|
||||||
Matvey Soloviev <blackhole89@gmail.com>
|
Matvey Soloviev <blackhole89@gmail.com>
|
||||||
|
@ -356,8 +441,10 @@ Maxime <672982+maximegmd@users.noreply.github.com>
|
||||||
Maximilian Winter <maximilian.winter.91@gmail.com>
|
Maximilian Winter <maximilian.winter.91@gmail.com>
|
||||||
Meng Zhang <meng@tabbyml.com>
|
Meng Zhang <meng@tabbyml.com>
|
||||||
Meng, Hengyu <hengyu.meng@intel.com>
|
Meng, Hengyu <hengyu.meng@intel.com>
|
||||||
|
Mengqing Cao <cmq0113@163.com>
|
||||||
Merrick Christensen <merrick.christensen@gmail.com>
|
Merrick Christensen <merrick.christensen@gmail.com>
|
||||||
Michael Coppola <m18coppola@gmail.com>
|
Michael Coppola <m18coppola@gmail.com>
|
||||||
|
Michael Francis <edude03@gmail.com>
|
||||||
Michael Hueschen <m@mhueschen.dev>
|
Michael Hueschen <m@mhueschen.dev>
|
||||||
Michael Kesper <mkesper@schokokeks.org>
|
Michael Kesper <mkesper@schokokeks.org>
|
||||||
Michael Klimenko <mklimenko29@gmail.com>
|
Michael Klimenko <mklimenko29@gmail.com>
|
||||||
|
@ -365,41 +452,57 @@ Michael Podvitskiy <podvitskiymichael@gmail.com>
|
||||||
Michael Potter <NanoTekGuy@Gmail.com>
|
Michael Potter <NanoTekGuy@Gmail.com>
|
||||||
Michael de Gans <michael.john.degans@gmail.com>
|
Michael de Gans <michael.john.degans@gmail.com>
|
||||||
Michaël de Vries <vriesdemichael@gmail.com>
|
Michaël de Vries <vriesdemichael@gmail.com>
|
||||||
|
Michał Tuszyński <srgtuszy@gmail.com>
|
||||||
Mihai <mihai.chirculescu@yahoo.com>
|
Mihai <mihai.chirculescu@yahoo.com>
|
||||||
Mike <ytianhui2004@gmail.com>
|
Mike <ytianhui2004@gmail.com>
|
||||||
Mikko Juola <mikjuo@gmail.com>
|
Mikko Juola <mikjuo@gmail.com>
|
||||||
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
|
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
|
||||||
|
Minsoo Cheong <icycle0409@snu.ac.kr>
|
||||||
Mirko185 <mirkosig@gmail.com>
|
Mirko185 <mirkosig@gmail.com>
|
||||||
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
|
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
|
||||||
|
MistApproach <98988043+MistApproach@users.noreply.github.com>
|
||||||
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
|
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
|
||||||
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
|
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
|
||||||
Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
|
Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
|
||||||
|
Molly Sophia <mollysophia379@gmail.com>
|
||||||
|
MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
|
||||||
Murilo Santana <mvrilo@gmail.com>
|
Murilo Santana <mvrilo@gmail.com>
|
||||||
Musab Gultekin <musabgultekin@users.noreply.github.com>
|
Musab Gultekin <musabgultekin@users.noreply.github.com>
|
||||||
Nam D. Tran <42194884+namtranase@users.noreply.github.com>
|
Nam D. Tran <42194884+namtranase@users.noreply.github.com>
|
||||||
Nathan Epstein <nate2@umbc.edu>
|
Nathan Epstein <nate2@umbc.edu>
|
||||||
|
Natsu <chino@hotococoa.moe>
|
||||||
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
|
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
|
||||||
Nebula <infinitewormhole@gmail.com>
|
Nebula <infinitewormhole@gmail.com>
|
||||||
Neo Zhang <14088817+arthw@users.noreply.github.com>
|
Neo Zhang <14088817+arthw@users.noreply.github.com>
|
||||||
Neo Zhang <zhang.jianyu@outlook.com>
|
Neo Zhang <zhang.jianyu@outlook.com>
|
||||||
Neo Zhang Jianyu <jianyu.zhang@intel.com>
|
Neo Zhang Jianyu <jianyu.zhang@intel.com>
|
||||||
Neuman Vong <neuman.vong@gmail.com>
|
Neuman Vong <neuman.vong@gmail.com>
|
||||||
|
Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
|
||||||
Nexesenex <124105151+Nexesenex@users.noreply.github.com>
|
Nexesenex <124105151+Nexesenex@users.noreply.github.com>
|
||||||
Niall Coates <1349685+Niall-@users.noreply.github.com>
|
Niall Coates <1349685+Niall-@users.noreply.github.com>
|
||||||
|
Nicholai Tukanov <nicholaitukanov@gmail.com>
|
||||||
|
Nico Bosshard <nico@bosshome.ch>
|
||||||
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
|
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
|
||||||
Nicolás Pérez <nicolas_perez@brown.edu>
|
Nicolás Pérez <nicolas_perez@brown.edu>
|
||||||
Nigel Bosch <pnigelb@gmail.com>
|
Nigel Bosch <pnigelb@gmail.com>
|
||||||
Niklas Korz <niklas@niklaskorz.de>
|
Niklas Korz <niklas@niklaskorz.de>
|
||||||
|
NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com>
|
||||||
Nikolas <127742645+nneubacher@users.noreply.github.com>
|
Nikolas <127742645+nneubacher@users.noreply.github.com>
|
||||||
Nindaleth <Nindaleth@users.noreply.github.com>
|
Nindaleth <Nindaleth@users.noreply.github.com>
|
||||||
|
OSecret <135510162+OLSecret@users.noreply.github.com>
|
||||||
Oleksandr Nikitin <oleksandr@tvori.info>
|
Oleksandr Nikitin <oleksandr@tvori.info>
|
||||||
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
|
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
|
||||||
Olivier Chafik <ochafik@users.noreply.github.com>
|
Olivier Chafik <ochafik@users.noreply.github.com>
|
||||||
Ondřej Čertík <ondrej@certik.us>
|
Ondřej Čertík <ondrej@certik.us>
|
||||||
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
|
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
|
||||||
|
PAB <pierreantoine.bannier@gmail.com>
|
||||||
|
Pablo Duboue <pablo.duboue@gmail.com>
|
||||||
|
Pascal Patry <ppatry@mtacitlabs.com>
|
||||||
Patrice Ferlet <metal3d@gmail.com>
|
Patrice Ferlet <metal3d@gmail.com>
|
||||||
Paul Tsochantaris <ptsochantaris@icloud.com>
|
Paul Tsochantaris <ptsochantaris@icloud.com>
|
||||||
|
Pavel Zloi <github.com@drteam.rocks>
|
||||||
Pavol Rusnak <pavol@rusnak.io>
|
Pavol Rusnak <pavol@rusnak.io>
|
||||||
|
Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com>
|
||||||
Pedro Cuenca <pedro@huggingface.co>
|
Pedro Cuenca <pedro@huggingface.co>
|
||||||
Peter Sugihara <peter@campsh.com>
|
Peter Sugihara <peter@campsh.com>
|
||||||
Phil H <5756783+phiharri@users.noreply.github.com>
|
Phil H <5756783+phiharri@users.noreply.github.com>
|
||||||
|
@ -407,10 +510,15 @@ Philip Taron <philip.taron@gmail.com>
|
||||||
Phillip Kravtsov <phillip@kravtsov.net>
|
Phillip Kravtsov <phillip@kravtsov.net>
|
||||||
Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
|
Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
|
||||||
Pierrick Hymbert <pierrick.hymbert@gmail.com>
|
Pierrick Hymbert <pierrick.hymbert@gmail.com>
|
||||||
|
Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
|
||||||
|
Plamen Minev <pacominev@gmail.com>
|
||||||
|
Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
|
||||||
Przemysław Pawełczyk <przemoc@gmail.com>
|
Przemysław Pawełczyk <przemoc@gmail.com>
|
||||||
Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
|
Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
|
||||||
Qingyou Meng <meng.qingyou@gmail.com>
|
Qingyou Meng <meng.qingyou@gmail.com>
|
||||||
Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
|
Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
|
||||||
|
R0CKSTAR <xiaodong.ye@mthreads.com>
|
||||||
|
R0CKSTAR <yeahdongcn@gmail.com>
|
||||||
RJ Adriaansen <adriaansen@eshcc.eur.nl>
|
RJ Adriaansen <adriaansen@eshcc.eur.nl>
|
||||||
Radoslav Gerganov <rgerganov@gmail.com>
|
Radoslav Gerganov <rgerganov@gmail.com>
|
||||||
Radosław Gryta <radek.gryta@gmail.com>
|
Radosław Gryta <radek.gryta@gmail.com>
|
||||||
|
@ -419,11 +527,13 @@ Raj Hammeer Singh Hada <hammeerraj@gmail.com>
|
||||||
Ralph Soika <ralph.soika@imixs.com>
|
Ralph Soika <ralph.soika@imixs.com>
|
||||||
Rand Xie <randxiexyy29@gmail.com>
|
Rand Xie <randxiexyy29@gmail.com>
|
||||||
Randall Fitzgerald <randall@dasaku.net>
|
Randall Fitzgerald <randall@dasaku.net>
|
||||||
|
Random Fly <renfei8@live.cn>
|
||||||
Reinforce-II <fate@eastal.com>
|
Reinforce-II <fate@eastal.com>
|
||||||
Ren Xuancheng <jklj077@users.noreply.github.com>
|
Ren Xuancheng <jklj077@users.noreply.github.com>
|
||||||
Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
|
Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
|
||||||
RhinoDevel <RhinoDevel@users.noreply.github.com>
|
RhinoDevel <RhinoDevel@users.noreply.github.com>
|
||||||
Riceball LEE <snowyu.lee@gmail.com>
|
Riceball LEE <snowyu.lee@gmail.com>
|
||||||
|
Rich Dougherty <rich@rd.nz>
|
||||||
Richard Kiss <him@richardkiss.com>
|
Richard Kiss <him@richardkiss.com>
|
||||||
Richard Roberson <richardr1126@gmail.com>
|
Richard Roberson <richardr1126@gmail.com>
|
||||||
Rick G <26732651+TheFlipbook@users.noreply.github.com>
|
Rick G <26732651+TheFlipbook@users.noreply.github.com>
|
||||||
|
@ -439,21 +549,30 @@ Robey Holderith <robey@flaminglunchbox.net>
|
||||||
Robyn <robyngraf@users.noreply.github.com>
|
Robyn <robyngraf@users.noreply.github.com>
|
||||||
Roger Meier <r.meier@siemens.com>
|
Roger Meier <r.meier@siemens.com>
|
||||||
Roland <14355895+rbur0425@users.noreply.github.com>
|
Roland <14355895+rbur0425@users.noreply.github.com>
|
||||||
|
Romain Biessy <romain.biessy@codeplay.com>
|
||||||
Romain D <90720+Artefact2@users.noreply.github.com>
|
Romain D <90720+Artefact2@users.noreply.github.com>
|
||||||
Romain Neutron <romain@neutron.io>
|
Romain Neutron <romain@neutron.io>
|
||||||
Roman Parykin <donderom@gmail.com>
|
Roman Parykin <donderom@gmail.com>
|
||||||
Ron Evans <ron@hybridgroup.com>
|
Ron Evans <ron@hybridgroup.com>
|
||||||
Ron Jailall <rojailal@gmail.com>
|
Ron Jailall <rojailal@gmail.com>
|
||||||
|
Roni <sulpher@gmx.net>
|
||||||
Ronny Brendel <ronnybrendel@gmail.com>
|
Ronny Brendel <ronnybrendel@gmail.com>
|
||||||
Ronsor <ronsor@ronsor.pw>
|
Ronsor <ronsor@ronsor.pw>
|
||||||
Rowan Hart <rowanbhart@gmail.com>
|
Rowan Hart <rowanbhart@gmail.com>
|
||||||
|
Ruchira Hasaranga <ruchira66@gmail.com>
|
||||||
|
Ruixin Huang <18860020911@163.com>
|
||||||
Rune <43761327+Rune-AI@users.noreply.github.com>
|
Rune <43761327+Rune-AI@users.noreply.github.com>
|
||||||
|
RunningLeon <maningsheng@sensetime.com>
|
||||||
|
RunningLeon <mnsheng@yeah.net>
|
||||||
Ryan Landay <rlanday@gmail.com>
|
Ryan Landay <rlanday@gmail.com>
|
||||||
Ryder Wishart <ryderwishart@gmail.com>
|
Ryder Wishart <ryderwishart@gmail.com>
|
||||||
Ryuei <louixs@users.noreply.github.com>
|
Ryuei <louixs@users.noreply.github.com>
|
||||||
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
||||||
|
SRHMorris <69468379+SRHMorris@users.noreply.github.com>
|
||||||
|
SXX <sxx1136965276@gmail.com>
|
||||||
SakuraUmi <yukinon244@gmail.com>
|
SakuraUmi <yukinon244@gmail.com>
|
||||||
Salvador E. Tropea <stropea@inti.gob.ar>
|
Salvador E. Tropea <stropea@inti.gob.ar>
|
||||||
|
Salvatore Mesoraca <s.mesoraca16@gmail.com>
|
||||||
Sam Spilsbury <smspillaz@gmail.com>
|
Sam Spilsbury <smspillaz@gmail.com>
|
||||||
Sami Farin <3876865+Safari77@users.noreply.github.com>
|
Sami Farin <3876865+Safari77@users.noreply.github.com>
|
||||||
Samuel Maynard <samwmaynard@gmail.com>
|
Samuel Maynard <samwmaynard@gmail.com>
|
||||||
|
@ -463,23 +582,29 @@ Sebastián A <sebastian.aedo29@gmail.com>
|
||||||
SebastianApel <13675545+SebastianApel@users.noreply.github.com>
|
SebastianApel <13675545+SebastianApel@users.noreply.github.com>
|
||||||
Senemu <10880819+Senemu@users.noreply.github.com>
|
Senemu <10880819+Senemu@users.noreply.github.com>
|
||||||
Sergey Alirzaev <zl29ah@gmail.com>
|
Sergey Alirzaev <zl29ah@gmail.com>
|
||||||
|
Sergio López <slp@redhat.com>
|
||||||
Sergio López <slp@sinrega.org>
|
Sergio López <slp@sinrega.org>
|
||||||
Sertaç Özercan <852750+sozercan@users.noreply.github.com>
|
Sertaç Özercan <852750+sozercan@users.noreply.github.com>
|
||||||
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
|
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
|
||||||
ShadovvBeast <ShadovvBeast@gmail.com>
|
ShadovvBeast <ShadovvBeast@gmail.com>
|
||||||
Shakhar Dasgupta <shakhardasgupta@gmail.com>
|
Shakhar Dasgupta <shakhardasgupta@gmail.com>
|
||||||
|
Shane A <shanea@allenai.org>
|
||||||
Shangning Xu <32517059+xushangning@users.noreply.github.com>
|
Shangning Xu <32517059+xushangning@users.noreply.github.com>
|
||||||
|
Shankar <gshankar.87@gmail.com>
|
||||||
|
Shanshan Shen <467638484@qq.com>
|
||||||
Shijie <821898965@qq.com>
|
Shijie <821898965@qq.com>
|
||||||
Shintarou Okada <kokuzen@gmail.com>
|
Shintarou Okada <kokuzen@gmail.com>
|
||||||
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
||||||
Shouzheng Liu <lshzh.hi@gmail.com>
|
Shouzheng Liu <lshzh.hi@gmail.com>
|
||||||
Shuichi Tsutsumi <shuichi0526@gmail.com>
|
Shuichi Tsutsumi <shuichi0526@gmail.com>
|
||||||
|
Shupei Fan <dymarkfan@outlook.com>
|
||||||
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
|
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
|
||||||
Simon Willison <swillison@gmail.com>
|
Simon Willison <swillison@gmail.com>
|
||||||
Siwen Yu <yusiwen@gmail.com>
|
Siwen Yu <yusiwen@gmail.com>
|
||||||
Sky Yan <skyan83@gmail.com>
|
Sky Yan <skyan83@gmail.com>
|
||||||
Slaren <2141330+slaren@users.noreply.github.com>
|
Slaren <2141330+slaren@users.noreply.github.com>
|
||||||
Slava Primenko <primenko.s@gmail.com>
|
Slava Primenko <primenko.s@gmail.com>
|
||||||
|
Small Grass Forest <zixuanxcl@gmail.com>
|
||||||
SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
|
SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
|
||||||
Someone <sergei.kozlukov@aalto.fi>
|
Someone <sergei.kozlukov@aalto.fi>
|
||||||
Someone Serge <sergei.kozlukov@aalto.fi>
|
Someone Serge <sergei.kozlukov@aalto.fi>
|
||||||
|
@ -491,12 +616,15 @@ Stefan Sydow <stefan@sydow.email>
|
||||||
Steffen Röcker <sroecker@gmail.com>
|
Steffen Röcker <sroecker@gmail.com>
|
||||||
Stephan Walter <stephan@walter.name>
|
Stephan Walter <stephan@walter.name>
|
||||||
Stephen Nichols <snichols@users.noreply.github.com>
|
Stephen Nichols <snichols@users.noreply.github.com>
|
||||||
|
Steve Bonds <sbonds@gmail.com>
|
||||||
Steve Grubb <ausearch.1@gmail.com>
|
Steve Grubb <ausearch.1@gmail.com>
|
||||||
Steven Prichard <spprichard20@gmail.com>
|
Steven Prichard <spprichard20@gmail.com>
|
||||||
Steven Roussey <sroussey@gmail.com>
|
Steven Roussey <sroussey@gmail.com>
|
||||||
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
|
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
|
||||||
|
StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
|
||||||
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
|
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
|
||||||
SuperUserNameMan <yoann@terminajones.com>
|
SuperUserNameMan <yoann@terminajones.com>
|
||||||
|
Sutou Kouhei <kou@cozmixng.org>
|
||||||
Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
|
Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
|
||||||
Taikono-Himazin <kazu@po.harenet.ne.jp>
|
Taikono-Himazin <kazu@po.harenet.ne.jp>
|
||||||
Tameem <113388789+AhmadTameem@users.noreply.github.com>
|
Tameem <113388789+AhmadTameem@users.noreply.github.com>
|
||||||
|
@ -507,7 +635,9 @@ Theia Vogel <theia@vgel.me>
|
||||||
Thérence <13496987+Royalphax@users.noreply.github.com>
|
Thérence <13496987+Royalphax@users.noreply.github.com>
|
||||||
Thibault Terrasson <thibault.terrasson@gmail.com>
|
Thibault Terrasson <thibault.terrasson@gmail.com>
|
||||||
Thomas Klausner <wiz@gatalith.at>
|
Thomas Klausner <wiz@gatalith.at>
|
||||||
|
Thorsten Sommer <SommerEngineering@users.noreply.github.com>
|
||||||
Tim Miller <drasticactions@users.noreply.github.com>
|
Tim Miller <drasticactions@users.noreply.github.com>
|
||||||
|
Tim Wang <overocean@gmail.com>
|
||||||
Timmy Knight <r2d2fish@gmail.com>
|
Timmy Knight <r2d2fish@gmail.com>
|
||||||
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
|
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
|
||||||
Ting Lou <ting.lou@gmail.com>
|
Ting Lou <ting.lou@gmail.com>
|
||||||
|
@ -517,24 +647,31 @@ Tom C <tom.corelis@gmail.com>
|
||||||
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
|
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
|
||||||
Tomas <tom.tomas.36478119@gmail.com>
|
Tomas <tom.tomas.36478119@gmail.com>
|
||||||
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
|
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
|
||||||
|
Tony Wasserka <4840017+neobrain@users.noreply.github.com>
|
||||||
Tristan Druyen <tristan@vault81.mozmail.com>
|
Tristan Druyen <tristan@vault81.mozmail.com>
|
||||||
Tristan Ross <rosscomputerguy@protonmail.com>
|
Tristan Ross <rosscomputerguy@protonmail.com>
|
||||||
|
Trivikram Kamat <16024985+trivikr@users.noreply.github.com>
|
||||||
Tungsten842 <886724vf@anonaddy.me>
|
Tungsten842 <886724vf@anonaddy.me>
|
||||||
Tungsten842 <quantmint@protonmail.com>
|
Tungsten842 <quantmint@protonmail.com>
|
||||||
Tushar <ditsuke@protonmail.com>
|
Tushar <ditsuke@protonmail.com>
|
||||||
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
||||||
|
Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com>
|
||||||
Ulrich Drepper <drepper@gmail.com>
|
Ulrich Drepper <drepper@gmail.com>
|
||||||
Uzo Nweke <uzoechi@gmail.com>
|
Uzo Nweke <uzoechi@gmail.com>
|
||||||
Vaibhav Srivastav <vaibhavs10@gmail.com>
|
Vaibhav Srivastav <vaibhavs10@gmail.com>
|
||||||
Val Kharitonov <mail@kharvd.com>
|
Val Kharitonov <mail@kharvd.com>
|
||||||
Valentin Konovalov <valle.ketsujin@gmail.com>
|
Valentin Konovalov <valle.ketsujin@gmail.com>
|
||||||
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
|
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
|
||||||
|
Vali Malinoiu <0x4139@gmail.com>
|
||||||
Victor Nogueira <felladrin@gmail.com>
|
Victor Nogueira <felladrin@gmail.com>
|
||||||
Victor Z. Peng <ziliangdotme@gmail.com>
|
Victor Z. Peng <ziliangdotme@gmail.com>
|
||||||
|
Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
|
||||||
|
Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
|
||||||
Vlad <spitfireage@gmail.com>
|
Vlad <spitfireage@gmail.com>
|
||||||
Vladimir <bogdad@gmail.com>
|
Vladimir <bogdad@gmail.com>
|
||||||
Vladimir Malyutin <first-leon@yandex.ru>
|
Vladimir Malyutin <first-leon@yandex.ru>
|
||||||
Vladimir Zorin <vladimir@deviant.guru>
|
Vladimir Zorin <vladimir@deviant.guru>
|
||||||
|
VoidIsVoid <343750470@qq.com>
|
||||||
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
|
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
|
||||||
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
|
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
|
||||||
Weird Constructor <weirdconstructor@gmail.com>
|
Weird Constructor <weirdconstructor@gmail.com>
|
||||||
|
@ -551,15 +688,22 @@ Xiang (Kevin) Li <kevinli020508@gmail.com>
|
||||||
Xiao-Yong Jin <jinxiaoyong@gmail.com>
|
Xiao-Yong Jin <jinxiaoyong@gmail.com>
|
||||||
XiaotaoChen <chenxiaotao1234@gmail.com>
|
XiaotaoChen <chenxiaotao1234@gmail.com>
|
||||||
Xiaoyi Chen <cxychina@gmail.com>
|
Xiaoyi Chen <cxychina@gmail.com>
|
||||||
|
Xie Yanbo <xieyanbo@gmail.com>
|
||||||
Xingchen Song(宋星辰) <xingchensong1996@163.com>
|
Xingchen Song(宋星辰) <xingchensong1996@163.com>
|
||||||
|
Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
|
||||||
Xuan Son Nguyen <thichthat@gmail.com>
|
Xuan Son Nguyen <thichthat@gmail.com>
|
||||||
|
Yaiko <elyaiko@hotmail.com>
|
||||||
Yann Follet <131855179+YannFollet@users.noreply.github.com>
|
Yann Follet <131855179+YannFollet@users.noreply.github.com>
|
||||||
Yaroslav <yaroslav.yashin@me.com>
|
Yaroslav <yaroslav.yashin@me.com>
|
||||||
Yazan Agha-Schrader <mountaiin@icloud.com>
|
Yazan Agha-Schrader <mountaiin@icloud.com>
|
||||||
Yiming Cui <conandiy@vip.qq.com>
|
Yiming Cui <conandiy@vip.qq.com>
|
||||||
Yishuo Wang <MeouSker77@outlook.com>
|
Yishuo Wang <MeouSker77@outlook.com>
|
||||||
|
Yoshi Suhara <y.suhara@gmail.com>
|
||||||
|
Yoshi Suhara <ysuhara@nvidia.com>
|
||||||
|
Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
|
||||||
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
|
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
|
||||||
Yui <dev@sleepyyui.com>
|
Yui <dev@sleepyyui.com>
|
||||||
|
Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
|
||||||
Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
|
Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
|
||||||
Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
|
Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
|
||||||
ZHAOKAI WANG <sanxianwei@163.com>
|
ZHAOKAI WANG <sanxianwei@163.com>
|
||||||
|
@ -568,6 +712,8 @@ Zay <95888118+isaiahbjork@users.noreply.github.com>
|
||||||
Zenix <zenixls2@gmail.com>
|
Zenix <zenixls2@gmail.com>
|
||||||
Zhang Peiyuan <a1286225768@gmail.com>
|
Zhang Peiyuan <a1286225768@gmail.com>
|
||||||
Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
|
Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
|
||||||
|
Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
|
||||||
|
Zhiyuan Li <lizhiyuan@uniartisan.com>
|
||||||
ZhouYuChen <zhouyuchen@naver.com>
|
ZhouYuChen <zhouyuchen@naver.com>
|
||||||
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
|
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
|
||||||
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
|
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
|
||||||
|
@ -581,6 +727,7 @@ alexpinel <93524949+alexpinel@users.noreply.github.com>
|
||||||
alonfaraj <alonfaraj@gmail.com>
|
alonfaraj <alonfaraj@gmail.com>
|
||||||
alwqx <kenan3015@gmail.com>
|
alwqx <kenan3015@gmail.com>
|
||||||
amd-lalithnc <lalithnc@amd.com>
|
amd-lalithnc <lalithnc@amd.com>
|
||||||
|
amritahs-ibm <amritahs@linux.vnet.ibm.com>
|
||||||
andrijdavid <david@geek.mg>
|
andrijdavid <david@geek.mg>
|
||||||
anon998 <131767832+anon998@users.noreply.github.com>
|
anon998 <131767832+anon998@users.noreply.github.com>
|
||||||
anzz1 <anzz1@live.com>
|
anzz1 <anzz1@live.com>
|
||||||
|
@ -588,14 +735,18 @@ apaz <aarpazdera@gmail.com>
|
||||||
apcameron <37645737+apcameron@users.noreply.github.com>
|
apcameron <37645737+apcameron@users.noreply.github.com>
|
||||||
arch-btw <57669023+arch-btw@users.noreply.github.com>
|
arch-btw <57669023+arch-btw@users.noreply.github.com>
|
||||||
arcrank <arcrank@gmail.com>
|
arcrank <arcrank@gmail.com>
|
||||||
|
ardfork <134447697+ardfork@users.noreply.github.com>
|
||||||
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
|
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
|
||||||
at8u <129688334+at8u@users.noreply.github.com>
|
at8u <129688334+at8u@users.noreply.github.com>
|
||||||
automaticcat <daogiatuank54@gmail.com>
|
automaticcat <daogiatuank54@gmail.com>
|
||||||
|
awatuna <23447591+awatuna@users.noreply.github.com>
|
||||||
|
b4b4o <zwbao@foxmail.com>
|
||||||
bandoti <141645996+bandoti@users.noreply.github.com>
|
bandoti <141645996+bandoti@users.noreply.github.com>
|
||||||
beiller <beiller@gmail.com>
|
beiller <beiller@gmail.com>
|
||||||
bhubbb <79117352+bhubbb@users.noreply.github.com>
|
bhubbb <79117352+bhubbb@users.noreply.github.com>
|
||||||
bmwl <brian.marshall@tolko.com>
|
bmwl <brian.marshall@tolko.com>
|
||||||
bobqianic <129547291+bobqianic@users.noreply.github.com>
|
bobqianic <129547291+bobqianic@users.noreply.github.com>
|
||||||
|
brucepro <git@brucepro.net>
|
||||||
bryanSwk <93190252+bryanSwk@users.noreply.github.com>
|
bryanSwk <93190252+bryanSwk@users.noreply.github.com>
|
||||||
bsilvereagle <bsilvereagle@users.noreply.github.com>
|
bsilvereagle <bsilvereagle@users.noreply.github.com>
|
||||||
bssrdf <merlintiger@hotmail.com>
|
bssrdf <merlintiger@hotmail.com>
|
||||||
|
@ -614,10 +765,14 @@ cpumaxx <163466046+cpumaxx@users.noreply.github.com>
|
||||||
crasm <crasm@git.vczf.net>
|
crasm <crasm@git.vczf.net>
|
||||||
crasm <crasm@git.vczf.us>
|
crasm <crasm@git.vczf.us>
|
||||||
daboe01 <daboe01@googlemail.com>
|
daboe01 <daboe01@googlemail.com>
|
||||||
|
daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com>
|
||||||
|
daminho <37615795+daminho@users.noreply.github.com>
|
||||||
david raistrick <keen99@users.noreply.github.com>
|
david raistrick <keen99@users.noreply.github.com>
|
||||||
ddh0 <dylanhalladay02@icloud.com>
|
ddh0 <dylanhalladay02@icloud.com>
|
||||||
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
||||||
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
||||||
|
devojony <61173062+devojony@users.noreply.github.com>
|
||||||
|
ditsuke <ditsuke@protonmail.com>
|
||||||
divinity76 <divinity76@gmail.com>
|
divinity76 <divinity76@gmail.com>
|
||||||
dm4 <sunrisedm4@gmail.com>
|
dm4 <sunrisedm4@gmail.com>
|
||||||
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
||||||
|
@ -629,14 +784,18 @@ ebraminio <ebraminio@gmail.com>
|
||||||
eiery <19350831+eiery@users.noreply.github.com>
|
eiery <19350831+eiery@users.noreply.github.com>
|
||||||
eric8607242 <e0928021388@gmail.com>
|
eric8607242 <e0928021388@gmail.com>
|
||||||
fairydreaming <166155368+fairydreaming@users.noreply.github.com>
|
fairydreaming <166155368+fairydreaming@users.noreply.github.com>
|
||||||
|
fengerhu1 <2748250768@qq.com>
|
||||||
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
||||||
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
||||||
gliptic <gliptic@users.noreply.github.com>
|
gliptic <gliptic@users.noreply.github.com>
|
||||||
goerch <jhr.walter@t-online.de>
|
goerch <jhr.walter@t-online.de>
|
||||||
grahameth <96447521+grahameth@users.noreply.github.com>
|
grahameth <96447521+grahameth@users.noreply.github.com>
|
||||||
|
gtygo <gtydoit@gmail.com>
|
||||||
gwjr <502526+gwjr@users.noreply.github.com>
|
gwjr <502526+gwjr@users.noreply.github.com>
|
||||||
h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
|
h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
|
||||||
hankcs <cnhankmc@gmail.com>
|
hankcs <cnhankmc@gmail.com>
|
||||||
|
haopeng <657407891@qq.com>
|
||||||
|
hipudding <huafengchun@gmail.com>
|
||||||
hoangmit <hoangmit@users.noreply.github.com>
|
hoangmit <hoangmit@users.noreply.github.com>
|
||||||
hongbo.mo <352280764@qq.com>
|
hongbo.mo <352280764@qq.com>
|
||||||
hopkins385 <98618192+hopkins385@users.noreply.github.com>
|
hopkins385 <98618192+hopkins385@users.noreply.github.com>
|
||||||
|
@ -649,12 +808,14 @@ hxer7963 <hxer7963@gmail.com>
|
||||||
hydai <z54981220@gmail.com>
|
hydai <z54981220@gmail.com>
|
||||||
iSma <ismail.senhaji@gmail.com>
|
iSma <ismail.senhaji@gmail.com>
|
||||||
iacore <74560659+iacore@users.noreply.github.com>
|
iacore <74560659+iacore@users.noreply.github.com>
|
||||||
|
icppWorld <124377669+icppWorld@users.noreply.github.com>
|
||||||
igarnier <igarnier@protonmail.com>
|
igarnier <igarnier@protonmail.com>
|
||||||
intelmatt <61025942+intelmatt@users.noreply.github.com>
|
intelmatt <61025942+intelmatt@users.noreply.github.com>
|
||||||
iohub <rickyang.pro@gmail.com>
|
iohub <rickyang.pro@gmail.com>
|
||||||
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
||||||
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
|
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
|
||||||
jameswu2014 <545426914@qq.com>
|
jameswu2014 <545426914@qq.com>
|
||||||
|
jdomke <28772296+jdomke@users.noreply.github.com>
|
||||||
jiez <373447296@qq.com>
|
jiez <373447296@qq.com>
|
||||||
jneem <joeneeman@gmail.com>
|
jneem <joeneeman@gmail.com>
|
||||||
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
|
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
|
||||||
|
@ -677,28 +838,35 @@ klosax <131523366+klosax@users.noreply.github.com>
|
||||||
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
|
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
|
||||||
kunnis <kunnis@users.noreply.github.com>
|
kunnis <kunnis@users.noreply.github.com>
|
||||||
kuronekosaiko <EvanChanJ@163.com>
|
kuronekosaiko <EvanChanJ@163.com>
|
||||||
|
kustaaya <58045274+kustaaya@users.noreply.github.com>
|
||||||
kuvaus <22169537+kuvaus@users.noreply.github.com>
|
kuvaus <22169537+kuvaus@users.noreply.github.com>
|
||||||
kwin1412 <42286931+kwin1412@users.noreply.github.com>
|
kwin1412 <42286931+kwin1412@users.noreply.github.com>
|
||||||
l3utterfly <gc.pthzfoldr@gmail.com>
|
l3utterfly <gc.pthzfoldr@gmail.com>
|
||||||
|
laik <laik.lj@me.com>
|
||||||
ldwang <ftgreat@163.com>
|
ldwang <ftgreat@163.com>
|
||||||
le.chang <cljs118@126.com>
|
le.chang <cljs118@126.com>
|
||||||
leejet <leejet714@gmail.com>
|
leejet <leejet714@gmail.com>
|
||||||
|
leo-pony <nengjunma@outlook.com>
|
||||||
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
||||||
liuwei-git <14815172+liuwei-git@users.noreply.github.com>
|
liuwei-git <14815172+liuwei-git@users.noreply.github.com>
|
||||||
lon <114724657+longregen@users.noreply.github.com>
|
lon <114724657+longregen@users.noreply.github.com>
|
||||||
loonerin <132926317+loonerin@users.noreply.github.com>
|
loonerin <132926317+loonerin@users.noreply.github.com>
|
||||||
|
ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
|
||||||
luoyu-intel <yu.luo@intel.com>
|
luoyu-intel <yu.luo@intel.com>
|
||||||
m3ndax <adrian.goessl@outlook.com>
|
m3ndax <adrian.goessl@outlook.com>
|
||||||
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
||||||
makomk <makosoft@googlemail.com>
|
makomk <makosoft@googlemail.com>
|
||||||
manikbhandari <mbbhandarimanik2@gmail.com>
|
manikbhandari <mbbhandarimanik2@gmail.com>
|
||||||
maor-ps <154728172+maor-ps@users.noreply.github.com>
|
maor-ps <154728172+maor-ps@users.noreply.github.com>
|
||||||
|
matiaslin <45382001+matiaslin@users.noreply.github.com>
|
||||||
|
matteo <matteogeniaccio@yahoo.it>
|
||||||
mdrokz <mohammadmunshi@gmail.com>
|
mdrokz <mohammadmunshi@gmail.com>
|
||||||
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
||||||
minarchist <minarchist@users.noreply.github.com>
|
minarchist <minarchist@users.noreply.github.com>
|
||||||
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
|
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
|
||||||
mmyjona <jonathan.gonse@gmail.com>
|
mmyjona <jonathan.gonse@gmail.com>
|
||||||
momonga <115213907+mmnga@users.noreply.github.com>
|
momonga <115213907+mmnga@users.noreply.github.com>
|
||||||
|
momonga <146910567+mmngays@users.noreply.github.com>
|
||||||
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
|
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
|
||||||
mzcu <milos.cubrilo@gmail.com>
|
mzcu <milos.cubrilo@gmail.com>
|
||||||
nanahi <130121847+na-na-hi@users.noreply.github.com>
|
nanahi <130121847+na-na-hi@users.noreply.github.com>
|
||||||
|
@ -716,8 +884,10 @@ omahs <73983677+omahs@users.noreply.github.com>
|
||||||
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
||||||
opparco <parco.opaai@gmail.com>
|
opparco <parco.opaai@gmail.com>
|
||||||
ostix360 <55257054+ostix360@users.noreply.github.com>
|
ostix360 <55257054+ostix360@users.noreply.github.com>
|
||||||
|
pculliton <phillipculliton@gmail.com>
|
||||||
pengxin99 <pengxin.yuan@intel.com>
|
pengxin99 <pengxin.yuan@intel.com>
|
||||||
perserk <perserk@gmail.com>
|
perserk <perserk@gmail.com>
|
||||||
|
piDack <104877312+piDack@users.noreply.github.com>
|
||||||
pmysl <piotr.myslinski@outlook.com>
|
pmysl <piotr.myslinski@outlook.com>
|
||||||
postmasters <namnguyen@google.com>
|
postmasters <namnguyen@google.com>
|
||||||
pudepiedj <pudepiedj@gmail.com>
|
pudepiedj <pudepiedj@gmail.com>
|
||||||
|
@ -733,6 +903,7 @@ runfuture <runfuture@users.noreply.github.com>
|
||||||
sandyiscool <sandyiscool@gmail.com>
|
sandyiscool <sandyiscool@gmail.com>
|
||||||
sasha0552 <admin@sasha0552.org>
|
sasha0552 <admin@sasha0552.org>
|
||||||
semidark <me@semidark.net>
|
semidark <me@semidark.net>
|
||||||
|
serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
|
||||||
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
||||||
shibe2 <shibe@tuta.io>
|
shibe2 <shibe@tuta.io>
|
||||||
singularity <12184989+singularity-s0@users.noreply.github.com>
|
singularity <12184989+singularity-s0@users.noreply.github.com>
|
||||||
|
@ -741,42 +912,55 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
|
||||||
slaren <2141330+slaren@users.noreply.github.com>
|
slaren <2141330+slaren@users.noreply.github.com>
|
||||||
slaren <slarengh@gmail.com>
|
slaren <slarengh@gmail.com>
|
||||||
snadampal <87143774+snadampal@users.noreply.github.com>
|
snadampal <87143774+snadampal@users.noreply.github.com>
|
||||||
|
standby24x7 <standby24x7@gmail.com>
|
||||||
staviq <staviq@gmail.com>
|
staviq <staviq@gmail.com>
|
||||||
stduhpf <stephduh@live.fr>
|
stduhpf <stephduh@live.fr>
|
||||||
strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
|
strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
|
||||||
swittk <switt1995@gmail.com>
|
swittk <switt1995@gmail.com>
|
||||||
takov751 <40316768+takov751@users.noreply.github.com>
|
takov751 <40316768+takov751@users.noreply.github.com>
|
||||||
tarcey <cey.tarik@gmail.com>
|
tarcey <cey.tarik@gmail.com>
|
||||||
|
tc-mb <157115220+tc-mb@users.noreply.github.com>
|
||||||
texmex76 <40733439+texmex76@users.noreply.github.com>
|
texmex76 <40733439+texmex76@users.noreply.github.com>
|
||||||
thement <40525767+thement@users.noreply.github.com>
|
thement <40525767+thement@users.noreply.github.com>
|
||||||
|
thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
|
||||||
tjohnman <tjohnman@users.noreply.github.com>
|
tjohnman <tjohnman@users.noreply.github.com>
|
||||||
|
toyer <2042519524@qq.com>
|
||||||
tslmy <tslmy@users.noreply.github.com>
|
tslmy <tslmy@users.noreply.github.com>
|
||||||
ubik2 <ubik2@users.noreply.github.com>
|
ubik2 <ubik2@users.noreply.github.com>
|
||||||
uint256_t <konndennsa@gmail.com>
|
uint256_t <konndennsa@gmail.com>
|
||||||
uint256_t <maekawatoshiki1017@gmail.com>
|
uint256_t <maekawatoshiki1017@gmail.com>
|
||||||
unbounded <haakon@likedan.net>
|
unbounded <haakon@likedan.net>
|
||||||
|
uvos <devnull@uvos.xyz>
|
||||||
valiray <133289098+valiray@users.noreply.github.com>
|
valiray <133289098+valiray@users.noreply.github.com>
|
||||||
|
vb <vaibhavs10@gmail.com>
|
||||||
vik <vikhyatk@gmail.com>
|
vik <vikhyatk@gmail.com>
|
||||||
viric <viric@viric.name>
|
viric <viric@viric.name>
|
||||||
vodkaslime <646329483@qq.com>
|
vodkaslime <646329483@qq.com>
|
||||||
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
||||||
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
||||||
|
wangshuai09 <391746016@qq.com>
|
||||||
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
||||||
whoreson <139810751+whoreson@users.noreply.github.com>
|
whoreson <139810751+whoreson@users.noreply.github.com>
|
||||||
woachk <24752637+woachk@users.noreply.github.com>
|
woachk <24752637+woachk@users.noreply.github.com>
|
||||||
wonjun Jang <strutive07@gmail.com>
|
wonjun Jang <strutive07@gmail.com>
|
||||||
woodx <124784234+woodx9@users.noreply.github.com>
|
woodx <124784234+woodx9@users.noreply.github.com>
|
||||||
|
wwoodsTM <104587230+wwoodsTM@users.noreply.github.com>
|
||||||
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
||||||
xaedes <xaedes@gmail.com>
|
xaedes <xaedes@gmail.com>
|
||||||
xaedes <xaedes@googlemail.com>
|
xaedes <xaedes@googlemail.com>
|
||||||
|
xctan <axunlei@gmail.com>
|
||||||
xloem <0xloem@gmail.com>
|
xloem <0xloem@gmail.com>
|
||||||
yangli2 <yangli2@gmail.com>
|
yangli2 <yangli2@gmail.com>
|
||||||
yuiseki <yuiseki@gmail.com>
|
yuiseki <yuiseki@gmail.com>
|
||||||
|
yuri@FreeBSD <yurivict@users.noreply.github.com>
|
||||||
zakkor <edward.partenie@gmail.com>
|
zakkor <edward.partenie@gmail.com>
|
||||||
zhangkaihuo <zhangkaihuo@gmail.com>
|
zhangkaihuo <zhangkaihuo@gmail.com>
|
||||||
|
zhentaoyu <zhentao.yu@intel.com>
|
||||||
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
||||||
zhouwg <zhouwg2000@gmail.com>
|
zhouwg <zhouwg2000@gmail.com>
|
||||||
zrm <trustiosity.zrm@gmail.com>
|
zrm <trustiosity.zrm@gmail.com>
|
||||||
Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
|
Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
|
||||||
|
杨朱 · Kiki <baofa.fan@daocloud.io>
|
||||||
源文雨 <41315874+fumiama@users.noreply.github.com>
|
源文雨 <41315874+fumiama@users.noreply.github.com>
|
||||||
|
蕭澧邦 <45505768+shou692199@users.noreply.github.com>
|
||||||
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
|
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
|
||||||
|
|
|
CMakeLists.txt

@@ -46,6 +46,13 @@ if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 endif()
 
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/source-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/source-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/execution-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/execution-charset:utf-8>")
+endif()
+
 #
 # option list
 #
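For reference, MSVC also accepts the single `/utf-8` switch, which implies both `/source-charset:utf-8` and `/execution-charset:utf-8`. A minimal sketch of that alternative form (illustration only; the project keeps the explicit per-language flags above):

```cmake
# Sketch: /utf-8 implies both /source-charset:utf-8 and /execution-charset:utf-8.
if (MSVC)
    add_compile_options("$<$<COMPILE_LANGUAGE:C,CXX>:/utf-8>")
endif()
```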
@@ -75,6 +82,7 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
 
 # override ggml options
 set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
|
||||||
set(GGML_LLAMAFILE_DEFAULT ON)
|
set(GGML_LLAMAFILE_DEFAULT ON)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (NOT DEFINED GGML_AMX)
|
|
||||||
set(GGML_AMX ON)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (NOT DEFINED GGML_CUDA_GRAPHS)
|
if (NOT DEFINED GGML_CUDA_GRAPHS)
|
||||||
set(GGML_CUDA_GRAPHS_DEFAULT ON)
|
set(GGML_CUDA_GRAPHS_DEFAULT ON)
|
||||||
endif()
|
endif()
|
||||||
|
@ -140,7 +144,6 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
|
||||||
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
||||||
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
||||||
|
|
||||||
|
|
||||||
# At the moment some compile definitions are placed within the ggml/src
|
# At the moment some compile definitions are placed within the ggml/src
|
||||||
# directory but not exported on the `ggml` target. This could be improved by
|
# directory but not exported on the `ggml` target. This could be improved by
|
||||||
# determining _precisely_ which defines are necessary for the llama-config
|
# determining _precisely_ which defines are necessary for the llama-config
|
||||||
|
@@ -157,8 +160,11 @@ if (GGML_TARGET_DEFINES)
     list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
 endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
-set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
+# all public headers
+set(LLAMA_PUBLIC_HEADERS
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
+set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
 install(TARGETS llama LIBRARY PUBLIC_HEADER)
 
 configure_package_config_file(
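With llama-cpp.h now installed alongside llama.h, a downstream project can pick both up through the package config generated by configure_package_config_file. A minimal consumer sketch (project and target names "demo" are invented; it assumes llama.cpp was built and installed and that find_package resolves the exported llama target):

```cmake
# Hypothetical downstream CMakeLists.txt consuming an installed llama.cpp.
cmake_minimum_required(VERSION 3.14)
project(demo LANGUAGES CXX)

find_package(llama REQUIRED)    # locates the installed llama package config

add_executable(demo main.cpp)   # main.cpp may #include <llama.h> or <llama-cpp.h>
target_link_libraries(demo PRIVATE llama)
```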
CMakePresets.json

@@ -29,6 +29,7 @@
     { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
     { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
     { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
+    { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },
 
     {
         "name": "arm64-windows-msvc", "hidden": true,
@@ -48,10 +49,23 @@
         }
     },
 
+    {
+        "name": "arm64-apple-clang", "hidden": true,
+        "architecture": { "value": "arm64", "strategy": "external" },
+        "toolset": { "value": "host=x64", "strategy": "external" },
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
+        }
+    },
+
     { "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
     { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
     { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
 
+    { "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
+    { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
+    { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
+
     { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
     { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
     { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
||||||
{ "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
|
{ "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
|
||||||
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
|
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
|
||||||
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
|
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
|
||||||
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
|
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
|
||||||
|
|
||||||
|
{ "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
|
||||||
|
{ "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
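The entries added above are configure presets. A usage sketch, assuming a CMake recent enough to support presets; the binary directory is decided by the inherited `base` preset, so the path below is an assumption rather than something stated in this diff:

    # list the configure presets defined in CMakePresets.json
    cmake --list-presets
    # configure with one of the new presets, then build the directory reported by the configure step
    cmake --preset x64-windows-vulkan-release
    cmake --build build-x64-windows-vulkan-release --config Release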
CODEOWNERS (new file, 3 lines)
@@ -0,0 +1,3 @@
+# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
+
+ci/ @ggerganov
CONTRIBUTING.md
@@ -1,9 +1,10 @@
 # Pull requests (for contributors)
 
 - Test your changes:
-- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the `ggml` library
 - Execute [the full CI locally on your machine](ci/README.md) before publishing
-- Optionally rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs
+- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
+- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
+- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments
 
@@ -12,6 +13,7 @@
 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
 - Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
+- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
 
 # Coding guidelines
 
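The contributor checks mentioned above can be run by hand; a rough sketch, assuming a CMake build tree in `build/` and a GGUF model and dataset at paths of your choosing (both paths are illustrative, not part of the guidelines):

    # backend consistency check for ggml operators (needs at least two ggml backends, e.g. CPU + Metal)
    ./build/bin/test-backend-ops
    # compare perplexity and throughput before and after a change
    ./build/bin/llama-perplexity -m models/model.gguf -f wikitext-2-raw/wiki.test.raw
    ./build/bin/llama-bench -m models/model.gguf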
Makefile (611 changed lines)
@@ -1,7 +1,10 @@
+ifndef LLAMA_MAKEFILE
+$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+endif
+
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 libllava.a \
-llama-baby-llama \
 llama-batched \
 llama-batched-bench \
 llama-bench \
@@ -34,6 +37,8 @@ BUILD_TARGETS = \
 llama-save-load-state \
 llama-server \
 llama-simple \
+llama-simple-chat \
+llama-run \
 llama-speculative \
 llama-tokenize \
 llama-vdot \
@@ -49,7 +54,6 @@ TEST_TARGETS = \
 tests/test-backend-ops \
 tests/test-chat-template \
 tests/test-double-float \
-tests/test-grad0 \
 tests/test-grammar-integration \
 tests/test-grammar-parser \
 tests/test-json-schema-to-grammar \
@@ -57,7 +61,6 @@ TEST_TARGETS = \
 tests/test-llama-grammar \
 tests/test-log \
 tests/test-model-load-cancel \
-tests/test-opt \
 tests/test-quantize-fns \
 tests/test-quantize-perf \
 tests/test-rope \
@@ -66,6 +69,7 @@ TEST_TARGETS = \
 tests/test-tokenizer-0 \
 tests/test-tokenizer-1-bpe \
 tests/test-tokenizer-1-spm
+# tests/test-opt \
 
 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
 LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
|
||||||
# Compile flags
|
# Compile flags
|
||||||
#
|
#
|
||||||
|
|
||||||
# keep standard at C11 and C++11
|
# keep standard at C11 and C++17
|
||||||
MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
|
MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
|
||||||
MK_CFLAGS = -std=c11 -fPIC
|
MK_CFLAGS = -std=c11 -fPIC
|
||||||
MK_CXXFLAGS = -std=c++11 -fPIC
|
MK_CXXFLAGS = -std=c++17 -fPIC
|
||||||
MK_NVCCFLAGS = -std=c++11
|
MK_NVCCFLAGS = -std=c++17
|
||||||
|
|
||||||
ifdef LLAMA_NO_CCACHE
|
ifdef LLAMA_NO_CCACHE
|
||||||
GGML_NO_CCACHE := 1
|
GGML_NO_CCACHE := 1
|
||||||
|
@ -294,6 +298,7 @@ endif
|
||||||
# some memory allocation are available on Linux through GNU extensions in libc
|
# some memory allocation are available on Linux through GNU extensions in libc
|
||||||
ifeq ($(UNAME_S),Linux)
|
ifeq ($(UNAME_S),Linux)
|
||||||
MK_CPPFLAGS += -D_GNU_SOURCE
|
MK_CPPFLAGS += -D_GNU_SOURCE
|
||||||
|
MK_LDFLAGS += -ldl
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
|
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
|
||||||
|
@ -362,6 +367,10 @@ ifdef LLAMA_SERVER_SSL
|
||||||
MK_LDFLAGS += -lssl -lcrypto
|
MK_LDFLAGS += -lssl -lcrypto
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifndef GGML_NO_CPU_AARCH64
|
||||||
|
MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
|
||||||
|
endif
|
||||||
|
|
||||||
# warnings
|
# warnings
|
||||||
WARN_FLAGS = \
|
WARN_FLAGS = \
|
||||||
-Wall \
|
-Wall \
|
||||||
|
@@ -526,70 +535,62 @@ ifndef GGML_NO_ACCELERATE
 # Mac OS - include Accelerate framework.
 # `-framework Accelerate` works both with Apple Silicon and Mac Intel
 ifeq ($(UNAME_S),Darwin)
-MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
+MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
 MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
 MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
 MK_LDFLAGS += -framework Accelerate
-OBJ_GGML += ggml/src/ggml-blas.o
+OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif
 endif # GGML_NO_ACCELERATE
 
-ifdef GGML_MUSA
-CC := clang
-CXX := clang++
-GGML_CUDA := 1
-MK_CPPFLAGS += -DGGML_USE_MUSA
-endif
-
 ifndef GGML_NO_OPENMP
 MK_CPPFLAGS += -DGGML_USE_OPENMP
 MK_CFLAGS += -fopenmp
 MK_CXXFLAGS += -fopenmp
-ifdef GGML_MUSA
-MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
-MK_LDFLAGS += -L/usr/lib/llvm-10/lib
-endif # GGML_MUSA
 endif # GGML_NO_OPENMP
 
 ifdef GGML_OPENBLAS
 MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
 MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
 MK_LDFLAGS += $(shell pkg-config --libs openblas)
-OBJ_GGML += ggml/src/ggml-blas.o
+OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_OPENBLAS
 
 ifdef GGML_OPENBLAS64
 MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
 MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64)
 MK_LDFLAGS += $(shell pkg-config --libs openblas64)
-OBJ_GGML += ggml/src/ggml-blas.o
+OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_OPENBLAS64
 
 ifdef GGML_BLIS
 MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
 MK_LDFLAGS += -lblis -L/usr/local/lib
-OBJ_GGML += ggml/src/ggml-blas.o
+OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_BLIS
 
 ifdef GGML_NVPL
 MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
 MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
-OBJ_GGML += ggml/src/ggml-blas.o
+OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_NVPL
 
 ifndef GGML_NO_LLAMAFILE
 MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-OBJ_GGML += ggml/src/llamafile/sgemm.o
+OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o
 endif
 
 ifndef GGML_NO_AMX
 MK_CPPFLAGS += -DGGML_USE_AMX
-OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
+OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o
 endif
 
+# only necessary for the CPU backend files
+MK_CPPFLAGS += -Iggml/src/ggml-cpu
+
 ifdef GGML_RPC
 MK_CPPFLAGS += -DGGML_USE_RPC
-OBJ_GGML += ggml/src/ggml-rpc.o
+OBJ_GGML_EXT += ggml/src/ggml-rpc.o
 endif # GGML_RPC
 
 OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
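The BLAS blocks above only change preprocessor flags and object paths, so invoking them is unchanged. A sketch, assuming OpenBLAS is installed and discoverable through pkg-config:

    # legacy Makefile build against OpenBLAS
    make LLAMA_MAKEFILE=1 GGML_OPENBLAS=1 -j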
@@ -604,17 +605,6 @@ else
 endif # GGML_CUDA_FA_ALL_QUANTS
 
 ifdef GGML_CUDA
-ifdef GGML_MUSA
-ifneq ('', '$(wildcard /opt/musa)')
-CUDA_PATH ?= /opt/musa
-else
-CUDA_PATH ?= /usr/local/musa
-endif
-
-MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
-MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
-MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22
-else
 ifneq ('', '$(wildcard /opt/cuda)')
 CUDA_PATH ?= /opt/cuda
 else
@@ -624,21 +614,18 @@ ifdef GGML_CUDA
 MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
 MK_NVCCFLAGS += -use_fast_math
-endif # GGML_MUSA
 
-OBJ_GGML += ggml/src/ggml-cuda.o
-OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
-OBJ_GGML += $(OBJ_CUDA_TMPL)
+OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
 
 ifdef LLAMA_FATAL_WARNINGS
 MK_NVCCFLAGS += -Werror all-warnings
 endif # LLAMA_FATAL_WARNINGS
 
-ifndef GGML_MUSA
 ifndef JETSON_EOL_MODULE_DETECT
 MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
-endif # GGML_MUSA
 
 ifdef LLAMA_DEBUG
 MK_NVCCFLAGS += -lineinfo
@@ -650,12 +637,8 @@ endif # GGML_CUDA_DEBUG
 
 ifdef GGML_CUDA_NVCC
 NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
-else
-ifdef GGML_MUSA
-NVCC = $(CCACHE) mcc
 else
 NVCC = $(CCACHE) nvcc
-endif # GGML_MUSA
 endif # GGML_CUDA_NVCC
 
 ifdef CUDA_DOCKER_ARCH
@@ -664,10 +647,6 @@ else ifndef CUDA_POWER_ARCH
 MK_NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH
 
-ifdef GGML_CUDA_FORCE_DMMV
-MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
-endif # GGML_CUDA_FORCE_DMMV
-
 ifdef GGML_CUDA_FORCE_MMQ
 MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # GGML_CUDA_FORCE_MMQ
@@ -676,20 +655,6 @@ ifdef GGML_CUDA_FORCE_CUBLAS
 MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
 endif # GGML_CUDA_FORCE_CUBLAS
 
-ifdef GGML_CUDA_DMMV_X
-MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
-else
-MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
-endif # GGML_CUDA_DMMV_X
-
-ifdef GGML_CUDA_MMV_Y
-MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
-else ifdef GGML_CUDA_DMMV_Y
-MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for backwards compatibility
-else
-MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
-endif # GGML_CUDA_MMV_Y
-
 ifdef GGML_CUDA_F16
 MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # GGML_CUDA_F16
@@ -698,12 +663,6 @@ ifdef GGML_CUDA_DMMV_F16
 MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # GGML_CUDA_DMMV_F16
 
-ifdef GGML_CUDA_KQUANTS_ITER
-MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
-else
-MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
-endif
-
 ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
 MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
 else
@@ -725,17 +684,11 @@ endif # GGML_CUDA_FA_ALL_QUANTS
 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE
 $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
-endef # NVCC_COMPILE
-else
-ifdef GGML_MUSA
-define NVCC_COMPILE
-$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
 endef # NVCC_COMPILE
 else
 define NVCC_COMPILE
 $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
-endif # GGML_MUSA
 endif # JETSON_EOL_MODULE_DETECT
 
 ggml/src/ggml-cuda/%.o: \
@@ -745,8 +698,8 @@ ggml/src/ggml-cuda/%.o: \
 ggml/src/ggml-cuda/common.cuh
 $(NVCC_COMPILE)
 
-ggml/src/ggml-cuda.o: \
-ggml/src/ggml-cuda.cu \
+ggml/src/ggml-cuda/ggml-cuda.o: \
+ggml/src/ggml-cuda/ggml-cuda.cu \
 ggml/include/ggml-cuda.h \
 ggml/include/ggml.h \
 ggml/include/ggml-backend.h \
@@ -759,7 +712,7 @@ endif # GGML_CUDA
 ifdef GGML_VULKAN
 MK_CPPFLAGS += -DGGML_USE_VULKAN
 MK_LDFLAGS += $(shell pkg-config --libs vulkan)
-OBJ_GGML += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
+OBJ_GGML_EXT += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
 
 ifdef GGML_VULKAN_CHECK_RESULTS
 MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
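The Vulkan block above is enabled the same way as before; a sketch, assuming the Vulkan SDK (including glslc and the pkg-config files) is installed:

    # legacy Makefile build with the Vulkan backend
    make LLAMA_MAKEFILE=1 GGML_VULKAN=1 -j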
@@ -789,10 +742,10 @@ GLSLC_CMD = glslc
 _ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
 _ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
 _ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
-_ggml_vk_input_dir = ggml/src/vulkan-shaders
+_ggml_vk_input_dir = ggml/src/ggml-vulkan/vulkan-shaders
 _ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
 
-ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
+ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
 $(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
 
 $(_ggml_vk_header): $(_ggml_vk_source)
@@ -804,12 +757,12 @@ $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
 --target-hpp $(_ggml_vk_header) \
 --target-cpp $(_ggml_vk_source)
 
-vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
-$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
 
 endif # GGML_VULKAN
 
-ifdef GGML_HIPBLAS
+ifdef GGML_HIP
 ifeq ($(wildcard /opt/rocm),)
 ROCM_PATH ?= /usr
 AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -818,11 +771,7 @@ ifdef GGML_HIPBLAS
 AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 endif
 
-GGML_CUDA_DMMV_X ?= 32
-GGML_CUDA_MMV_Y ?= 1
-GGML_CUDA_KQUANTS_ITER ?= 2
-
-MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
+MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
 
 ifdef GGML_HIP_UMA
 MK_CPPFLAGS += -DGGML_HIP_UMA
@@ -835,13 +784,6 @@ endif # GGML_HIP_UMA
 HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
 
 HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
-HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
-HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
-HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
-
-ifdef GGML_CUDA_FORCE_DMMV
-HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
-endif # GGML_CUDA_FORCE_DMMV
-
 ifdef GGML_CUDA_FORCE_MMQ
 HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
@@ -855,12 +797,12 @@ ifdef GGML_CUDA_NO_PEER_COPY
 HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY
 
-OBJ_GGML += ggml/src/ggml-cuda.o
-OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
-OBJ_GGML += $(OBJ_CUDA_TMPL)
+OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
 
-ggml/src/ggml-cuda.o: \
-ggml/src/ggml-cuda.cu \
+ggml/src/ggml-cuda/ggml-cuda.o: \
+ggml/src/ggml-cuda/ggml-cuda.cu \
 ggml/include/ggml-cuda.h \
 ggml/include/ggml.h \
 ggml/include/ggml-backend.h \
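Note that the build flag was renamed from GGML_HIPBLAS to GGML_HIP in the hunks above. A sketch of the corresponding invocation, assuming a ROCm installation under /opt/rocm (otherwise ROCM_PATH falls back to /usr as shown):

    # legacy Makefile build for AMD GPUs via HIP
    make LLAMA_MAKEFILE=1 GGML_HIP=1 -j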
@@ -875,39 +817,126 @@ ggml/src/ggml-cuda/%.o: \
 ggml/src/ggml-common.h \
 ggml/src/ggml-cuda/common.cuh
 $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-endif # GGML_HIPBLAS
+endif # GGML_HIP
 
+ifdef GGML_MUSA
+ifeq ($(wildcard /opt/musa),)
+MUSA_PATH ?= /usr/local/musa
+else
+MUSA_PATH ?= /opt/musa
+endif
+MUSA_ARCHITECTURES ?= 21;22
+
+MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
+MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
+MK_LDFLAGS += -lmusa -lmusart -lmublas
+
+ifndef GGML_NO_OPENMP
+# For Ubuntu Focal
+MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
+MK_LDFLAGS += -L/usr/lib/llvm-10/lib
+# For Ubuntu Jammy
+MK_CPPFLAGS += -I/usr/lib/llvm-14/lib/clang/14.0.0/include
+MK_LDFLAGS += -L/usr/lib/llvm-14/lib
+endif # GGML_NO_OPENMP
+
+CC := $(MUSA_PATH)/bin/clang
+CXX := $(MUSA_PATH)/bin/clang++
+MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
+
+MUSAFLAGS = -x musa -mtgpu
+MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))
+
+ifdef GGML_CUDA_FORCE_MMQ
+MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # GGML_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_CUBLAS
+MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+endif # GGML_CUDA_FORCE_CUBLAS
+
+ifdef GGML_CUDA_F16
+MUSAFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_F16
+
+ifdef GGML_CUDA_DMMV_F16
+MUSAFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_DMMV_F16
+
+ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
+else
+MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+ifdef GGML_CUDA_NO_PEER_COPY
+MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # GGML_CUDA_NO_PEER_COPY
+
+ifdef GGML_CUDA_FA_ALL_QUANTS
+MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # GGML_CUDA_FA_ALL_QUANTS
+
+OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
+
+ggml/src/ggml-cuda/ggml-cuda.o: \
+ggml/src/ggml-cuda/ggml-cuda.cu \
+ggml/include/ggml-cuda.h \
+ggml/include/ggml.h \
+ggml/include/ggml-backend.h \
+ggml/src/ggml-backend-impl.h \
+ggml/src/ggml-common.h \
+$(wildcard ggml/src/ggml-cuda/*.cuh)
+$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
+
+ggml/src/ggml-cuda/%.o: \
+ggml/src/ggml-cuda/%.cu \
+ggml/include/ggml.h \
+ggml/src/ggml-common.h \
+ggml/src/ggml-cuda/common.cuh
+$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
+endif # GGML_MUSA
+
 ifdef GGML_METAL
 MK_CPPFLAGS += -DGGML_USE_METAL
 MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
-OBJ_GGML += ggml/src/ggml-metal.o
+OBJ_GGML_EXT += ggml/src/ggml-metal/ggml-metal.o
+
+ifdef GGML_METAL_USE_BF16
+MK_CPPFLAGS += -DGGML_METAL_USE_BF16
+endif # GGML_METAL_USE_BF16
 ifdef GGML_METAL_NDEBUG
 MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
 ifdef GGML_METAL_EMBED_LIBRARY
 MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
-OBJ_GGML += ggml/src/ggml-metal-embed.o
+OBJ_GGML_EXT += ggml/src/ggml-metal-embed.o
 endif
 endif # GGML_METAL
 
 ifdef GGML_METAL
-ggml/src/ggml-metal.o: \
-ggml/src/ggml-metal.m \
+ggml/src/ggml-metal/ggml-metal.o: \
+ggml/src/ggml-metal/ggml-metal.m \
+ggml/src/ggml-metal/ggml-metal-impl.h \
 ggml/include/ggml-metal.h \
 ggml/include/ggml.h
 $(CC) $(CFLAGS) -c $< -o $@
 
 ifdef GGML_METAL_EMBED_LIBRARY
 ggml/src/ggml-metal-embed.o: \
-ggml/src/ggml-metal.metal \
+ggml/src/ggml-metal/ggml-metal.metal \
+ggml/src/ggml-metal/ggml-metal-impl.h \
 ggml/src/ggml-common.h
 @echo "Embedding Metal library"
-@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
+@sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
+@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
 $(eval TEMP_ASSEMBLY=$(shell mktemp -d))
 @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
 @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
 @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+@echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
 @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
 @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
 $(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
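The new GGML_MUSA block above replaces the old CUDA-path hack for Moore Threads GPUs. A sketch of how it would be invoked, with the variable defaults taken from the block itself (MUSA_PATH is auto-detected, MUSA_ARCHITECTURES defaults to 21;22, so passing it explicitly is optional):

    # legacy Makefile build for Moore Threads GPUs with the MUSA toolchain
    make LLAMA_MAKEFILE=1 GGML_MUSA=1 MUSA_ARCHITECTURES="21;22" -j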
@@ -916,32 +945,44 @@ ggml/src/ggml-metal-embed.o: \
 endif
 endif # GGML_METAL
 
-OBJ_GGML += \
-ggml/src/ggml.o \
-ggml/src/ggml-alloc.o \
-ggml/src/ggml-backend.o \
-ggml/src/ggml-quants.o \
-ggml/src/ggml-aarch64.o
+DIR_GGML = ggml
+DIR_LLAMA = src
+DIR_COMMON = common
+
+OBJ_GGML = \
+$(DIR_GGML)/src/ggml.o \
+$(DIR_GGML)/src/ggml-aarch64.o \
+$(DIR_GGML)/src/ggml-alloc.o \
+$(DIR_GGML)/src/ggml-backend.o \
+$(DIR_GGML)/src/ggml-backend-reg.o \
+$(DIR_GGML)/src/ggml-opt.o \
+$(DIR_GGML)/src/ggml-quants.o \
+$(DIR_GGML)/src/ggml-threading.o \
+$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
+$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
+$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
+$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
+$(OBJ_GGML_EXT)
 
 OBJ_LLAMA = \
-src/llama.o \
-src/llama-vocab.o \
-src/llama-grammar.o \
-src/llama-sampling.o \
-src/unicode.o \
-src/unicode-data.o
+$(DIR_LLAMA)/llama.o \
+$(DIR_LLAMA)/llama-vocab.o \
+$(DIR_LLAMA)/llama-grammar.o \
+$(DIR_LLAMA)/llama-sampling.o \
+$(DIR_LLAMA)/unicode.o \
+$(DIR_LLAMA)/unicode-data.o
 
 OBJ_COMMON = \
-common/common.o \
-common/arg.o \
-common/log.o \
-common/console.o \
-common/ngram-cache.o \
-common/sampling.o \
-common/train.o \
-common/build-info.o \
-common/json-schema-to-grammar.o \
-common/tool-call.o
+$(DIR_COMMON)/common.o \
+$(DIR_COMMON)/arg.o \
+$(DIR_COMMON)/log.o \
+$(DIR_COMMON)/console.o \
+$(DIR_COMMON)/ngram-cache.o \
+$(DIR_COMMON)/sampling.o \
+$(DIR_COMMON)/speculative.o \
+$(DIR_COMMON)/build-info.o \
+$(DIR_COMMON)/tool-call.o \
+$(DIR_COMMON)/json-schema-to-grammar.o
 
 OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
 
@@ -997,7 +1038,6 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef GGML_CUDA
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
-ifndef GGML_MUSA
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 
 ifndef CUDA_DOCKER_ARCH
@@ -1007,7 +1047,6 @@ endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # GGML_MUSA
 endif # GGML_CUDA
 $(info )
 
@@ -1044,232 +1083,85 @@ endif
 # Build libraries
 #
 
-# ggml
-
-ggml/src/ggml.o: \
-ggml/src/ggml.c \
-ggml/include/ggml.h
-$(CC) $(CFLAGS) -c $< -o $@
-
-ggml/src/ggml-alloc.o: \
-ggml/src/ggml-alloc.c \
-ggml/include/ggml.h \
-ggml/include/ggml-alloc.h
-$(CC) $(CFLAGS) -c $< -o $@
-
-ggml/src/ggml-backend.o: \
-ggml/src/ggml-backend.cpp \
-ggml/src/ggml-backend-impl.h \
-ggml/include/ggml.h \
-ggml/include/ggml-backend.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-ggml/src/ggml-quants.o: \
-ggml/src/ggml-quants.c \
-ggml/include/ggml.h \
-ggml/src/ggml-quants.h \
-ggml/src/ggml-common.h
-$(CC) $(CFLAGS) -c $< -o $@
-
-ggml/src/ggml-aarch64.o: \
-ggml/src/ggml-aarch64.c \
-ggml/include/ggml.h \
-ggml/src/ggml-aarch64.h \
-ggml/src/ggml-common.h
-$(CC) $(CFLAGS) -c $< -o $@
-
-ggml/src/ggml-blas.o: \
-ggml/src/ggml-blas.cpp \
-ggml/include/ggml-blas.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-ifndef GGML_NO_LLAMAFILE
-ggml/src/llamafile/sgemm.o: \
-ggml/src/llamafile/sgemm.cpp \
-ggml/src/llamafile/sgemm.h \
-ggml/include/ggml.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # GGML_NO_LLAMAFILE
-
-ifndef GGML_NO_AMX
-ggml/src/ggml-amx.o: \
-ggml/src/ggml-amx.cpp \
-ggml/include/ggml-amx.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-ggml/src/ggml-amx/mmq.o: \
-ggml/src/ggml-amx/mmq.cpp \
-ggml/src/ggml-amx/mmq.h \
-ggml/include/ggml.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-endif
-
-ifdef GGML_RPC
-ggml/src/ggml-rpc.o: \
-ggml/src/ggml-rpc.cpp \
-ggml/include/ggml-rpc.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # GGML_RPC
-
-$(LIB_GGML): \
-$(OBJ_GGML)
-$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
-
-$(LIB_GGML_S): \
-$(OBJ_GGML)
-ar rcs $(LIB_GGML_S) $^
-
-# llama
-
-src/unicode.o: \
-src/unicode.cpp \
-src/unicode.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-src/unicode-data.o: \
-src/unicode-data.cpp \
-src/unicode-data.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-src/llama.o: \
-src/llama.cpp \
-src/llama-impl.h \
-src/llama-vocab.h \
-src/llama-grammar.h \
-src/llama-sampling.h \
-src/unicode.h \
-include/llama.h \
-ggml/include/ggml-cuda.h \
-ggml/include/ggml-metal.h \
+# Libraries
+LIB_GGML = libggml.so
+LIB_GGML_S = libggml.a
+
+LIB_LLAMA = libllama.so
+LIB_LLAMA_S = libllama.a
+
+LIB_COMMON = libcommon.so
+LIB_COMMON_S = libcommon.a
+
+# Targets
+BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S)
+
+# Dependency files
+DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
+
+# Default target
+all: $(BUILD_TARGETS)
+
+# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
+# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
+$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
+ggml/src/ggml-cpu/ggml-cpu.cpp \
+ggml/include/ggml-backend.h \
 ggml/include/ggml.h \
 ggml/include/ggml-alloc.h \
-ggml/include/ggml-backend.h
+ggml/src/ggml-backend-impl.h \
+ggml/include/ggml-cpu.h \
+ggml/src/ggml-impl.h
 $(CXX) $(CXXFLAGS) -c $< -o $@
 
-src/llama-vocab.o: \
-src/llama-vocab.cpp \
-src/llama-vocab.h \
-src/llama-impl.h \
-include/llama.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-src/llama-grammar.o: \
-src/llama-grammar.cpp \
-src/llama-grammar.h \
-src/llama-impl.h \
-src/llama-vocab.h \
-src/llama-sampling.h \
-include/llama.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-src/llama-sampling.o: \
-src/llama-sampling.cpp \
-src/llama-sampling.h \
-src/llama-impl.h \
-include/llama.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-$(LIB_LLAMA): \
-$(OBJ_LLAMA) \
-$(LIB_GGML)
+# Rules for building object files
+$(DIR_GGML)/%.o: $(DIR_GGML)/%.c
+$(CC) $(CFLAGS) -MMD -c $< -o $@
+
+$(DIR_GGML)/%.o: $(DIR_GGML)/%.cpp
+$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
+
+$(DIR_LLAMA)/%.o: $(DIR_LLAMA)/%.cpp
+$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
+
+$(DIR_COMMON)/%.o: $(DIR_COMMON)/%.cpp
+$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
+
+# Rules for building libraries
+$(LIB_GGML): $(OBJ_GGML)
 $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
-$(LIB_LLAMA_S): \
-$(OBJ_LLAMA)
+$(LIB_GGML_S): $(OBJ_GGML)
+ar rcs $(LIB_GGML_S) $^
+
+$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML)
+$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_LLAMA_S): $(OBJ_LLAMA)
 ar rcs $(LIB_LLAMA_S) $^
 
-# common
-
-common/common.o: \
-common/common.cpp \
-common/common.h \
-common/chat-template.hpp \
-common/console.h \
-common/sampling.h \
-common/json.hpp \
-common/json-schema-to-grammar.h \
-common/minja.hpp \
-common/tool-call.cpp \
-common/tool-call.h \
-include/llama.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/arg.o: \
-common/arg.cpp \
-common/arg.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/log.o: \
-common/log.cpp \
-common/log.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/sampling.o: \
-common/sampling.cpp \
-common/sampling.h \
-include/llama.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/console.o: \
-common/console.cpp \
-common/console.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/json-schema-to-grammar.o: \
-common/json-schema-to-grammar.cpp \
-common/json-schema-to-grammar.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/tool-call.o: \
-common/tool-call.cpp \
-common/tool-call.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/train.o: \
-common/train.cpp \
-common/train.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/ngram-cache.o: \
-common/ngram-cache.cpp \
-common/ngram-cache.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-$(LIB_COMMON): \
-$(OBJ_COMMON) \
-$(LIB_LLAMA) \
-$(LIB_GGML)
+$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML)
 $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
-$(LIB_COMMON_S): \
-$(OBJ_COMMON)
+$(LIB_COMMON_S): $(OBJ_COMMON)
 ar rcs $(LIB_COMMON_S) $^
 
-clean:
-rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS)
-rm -rvf src/*.o
-rm -rvf tests/*.o
-rm -rvf examples/*.o
-rm -rvf common/*.o
-rm -rvf *.a
-rm -rvf *.dll
-rm -rvf *.so
-rm -rvf *.dot
-rm -rvf ggml/*.a
-rm -rvf ggml/*.dll
-rm -rvf ggml/*.so
-rm -vrf ggml/src/*.o
-rm -rvf ggml/src/llamafile/*.o
-rm -rvf common/build-info.cpp
-rm -vrf ggml/src/ggml-metal-embed.metal
-rm -vrf ggml/src/ggml-cuda/*.o
-rm -vrf ggml/src/ggml-cuda/template-instances/*.o
-rm -vrf ggml/src/ggml-amx/*.o
-rm -rvf $(BUILD_TARGETS)
-rm -rvf $(TEST_TARGETS)
-rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
-rm -rvf $(LEGACY_TARGETS_CLEAN)
-find examples pocs -type f -name "*.o" -delete
+# Include dependency files
+-include $(DEP_FILES)
+
+# Clean generated server assets
+clean-server-assets:
+find examples/server -type f -name "*.js.hpp" -delete
+find examples/server -type f -name "*.mjs.hpp" -delete
+find examples/server -type f -name "*.css.hpp" -delete
+find examples/server -type f -name "*.html.hpp" -delete
+
+# Clean rule
+clean: clean-server-assets
+rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
+rm -rvf *.a *.dll *.so *.dot
+find ggml src common tests examples pocs -type f -name "*.o" -delete
+find ggml src common tests examples pocs -type f -name "*.d" -delete
 
 #
 # Examples
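With the generic pattern rules above, objects are compiled with -MMD and the generated *.d files are pulled back in via -include, so header changes trigger rebuilds on the next run. The new clean targets can be exercised as follows (a sketch; `clean` already depends on `clean-server-assets`, so calling the latter separately is optional):

    # incremental rebuild picks up the *.d dependency files written by -MMD
    make LLAMA_MAKEFILE=1 -j llama-server
    # remove only the generated server assets, or everything
    make LLAMA_MAKEFILE=1 clean-server-assets
    make LLAMA_MAKEFILE=1 clean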
@@ -1295,11 +1187,21 @@ llama-infill: examples/infill/infill.cpp \
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+llama-run: examples/run/run.cpp \
+$(OBJ_ALL)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-simple: examples/simple/simple.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+llama-simple-chat: examples/simple-chat/simple-chat.cpp \
+$(OBJ_ALL)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-tokenize: examples/tokenize/tokenize.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1397,11 +1299,6 @@ llama-bench: examples/llama-bench/llama-bench.cpp \
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-baby-llama: examples/baby-llama/baby-llama.cpp \
-$(OBJ_ALL)
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 llama-export-lora: examples/export-lora/export-lora.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1467,33 +1364,18 @@ llama-server: \
 examples/server/server.cpp \
 examples/server/utils.hpp \
 examples/server/httplib.h \
-examples/server/colorthemes.css.hpp \
-examples/server/style.css.hpp \
-examples/server/theme-beeninorder.css.hpp \
-examples/server/theme-ketivah.css.hpp \
-examples/server/theme-mangotango.css.hpp \
-examples/server/theme-playground.css.hpp \
-examples/server/theme-polarnight.css.hpp \
-examples/server/theme-snowstorm.css.hpp \
 examples/server/index.html.hpp \
-examples/server/index-new.html.hpp \
-examples/server/index.js.hpp \
-examples/server/completion.js.hpp \
-examples/server/system-prompts.js.hpp \
-examples/server/prompt-formats.js.hpp \
-examples/server/json-schema-to-grammar.mjs.hpp \
 examples/server/loading.html.hpp \
 common/chat-template.hpp \
 common/json.hpp \
 common/minja.hpp \
-common/stb_image.h \
 common/tool-call.h \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
 # Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
-examples/server/%.hpp: examples/server/public/% Makefile
+examples/server/%.hpp: examples/server/public/% FORCE Makefile
 @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
 echo "unsigned char $${NAME}[] = {" && \
 cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
@@ -1602,11 +1484,6 @@ tests/test-minja: tests/test-minja.cpp \
 $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-grad0: tests/test-grad0.cpp \
-$(OBJ_GGML)
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 tests/test-opt: tests/test-opt.cpp \
 $(OBJ_GGML)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1688,7 +1565,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
 #
 # Mark legacy binary targets as .PHONY so that they are always checked.
-.PHONY: main quantize perplexity embedding server
+.PHONY: FORCE main quantize perplexity embedding server
 
 # Define the object file target
 examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
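The hunks above add build rules for the new llama-run and llama-simple-chat examples. A quick try-out sketch; the model path and the -m flag are illustrative assumptions rather than something specified in this diff:

    # build the new example binaries via the legacy Makefile path, then run one of them
    make LLAMA_MAKEFILE=1 -j llama-run llama-simple-chat
    ./llama-simple-chat -m ./models/model.gguf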
Package.swift

@@ -10,10 +10,16 @@ var sources = [
 "src/unicode.cpp",
 "src/unicode-data.cpp",
 "ggml/src/ggml.c",
+"ggml/src/ggml-aarch64.c",
 "ggml/src/ggml-alloc.c",
 "ggml/src/ggml-backend.cpp",
+"ggml/src/ggml-backend-reg.cpp",
+"ggml/src/ggml-cpu/ggml-cpu.c",
+"ggml/src/ggml-cpu/ggml-cpu.cpp",
+"ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
+"ggml/src/ggml-cpu/ggml-cpu-quants.c",
+"ggml/src/ggml-threading.cpp",
 "ggml/src/ggml-quants.c",
-"ggml/src/ggml-aarch64.c",
 ]

 var resources: [Resource] = []

@@ -21,21 +27,26 @@ var linkerSettings: [LinkerSetting] = []
 var cSettings: [CSetting] = [
 .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
 .unsafeFlags(["-fno-objc-arc"]),
+.headerSearchPath("ggml/src"),
+.headerSearchPath("ggml/src/ggml-cpu"),
 // NOTE: NEW_LAPACK will required iOS version 16.4+
 // We should consider add this in the future when we drop support for iOS 14
 // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
 // .define("ACCELERATE_NEW_LAPACK"),
 // .define("ACCELERATE_LAPACK_ILP64")
+.define("GGML_USE_CPU"),
 ]


 #if canImport(Darwin)
-sources.append("ggml/src/ggml-metal.m")
-resources.append(.process("ggml/src/ggml-metal.metal"))
+sources.append("ggml/src/ggml-common.h")
+sources.append("ggml/src/ggml-metal/ggml-metal.m")
+resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal"))
 linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
 contentsOf: [
 .define("GGML_USE_ACCELERATE"),
-.define("GGML_USE_METAL")
+.define("GGML_USE_METAL"),
 ]
 )
 #endif

@@ -60,13 +71,15 @@ let package = Package(
 name: "llama",
 path: ".",
 exclude: [
+"build",
 "cmake",
 "examples",
 "scripts",
 "models",
 "tests",
 "CMakeLists.txt",
-"Makefile"
+"Makefile",
+"ggml/src/ggml-metal-embed.metal"
 ],
 sources: sources,
 resources: resources,

@@ -75,5 +88,5 @@ let package = Package(
 linkerSettings: linkerSettings
 )
 ],
-cxxLanguageStandard: .cxx11
+cxxLanguageStandard: .cxx17
 )
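As a quick sanity check of the manifest changes above, the package should still resolve and build with stock SwiftPM from the repository root. The commands below are a minimal sketch using standard `swift` CLI subcommands; they are not part of this diff and assume a recent Swift toolchain is installed.

```bash
# build the package described by Package.swift (release configuration is an arbitrary choice)
swift build -c release

# inspect the targets and sources SwiftPM resolved from the manifest
swift package describe
```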

490 README.md

@@ -4,7 +4,6 @@

 [](https://opensource.org/licenses/MIT)
 [](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
-[](https://conan.io/center/llama-cpp)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

@@ -17,7 +16,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ## Hot topics

-- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
+- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
+- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

 ----

@@ -25,7 +25,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ## Description

 The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
-variety of hardware - locally and in the cloud.
+range of hardware - locally and in the cloud.

 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks

@@ -35,14 +35,17 @@ variety of hardware - locally and in the cloud.
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

-Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
-improved significantly thanks to many contributions. It is the main playground for developing new features for the
-[ggml](https://github.com/ggerganov/ggml) library.
+The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.

-**Supported models:**
+<details>
+<summary>Models</summary>

 Typically finetunes of the base models below are supported as well.

+Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md)
+
+#### Text-only
+
 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
 - [x] LLaMA 3 🦙🦙🦙

@@ -78,6 +81,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
+- [x] [OLMo 2](https://allenai.org/olmo)
 - [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
 - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)

@@ -95,9 +99,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)

-(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
-
-**Multimodal models:**
+#### Multimodal

 - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
 - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)

@@ -109,7 +111,10 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)

-**Bindings:**
+</details>
+
+<details>
+<summary>Bindings</summary>

 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)

@@ -130,196 +135,145 @@ Typically finetunes of the base models below are supported as well.
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
+- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
 - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
 - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)

-**UI:**
-
-Unless otherwise noted these projects are open-source with permissive licensing:
-
-- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
-- [iohub/collama](https://github.com/iohub/coLLaMA)
-- [janhq/jan](https://github.com/janhq/jan) (AGPL)
-- [nat/openplayground](https://github.com/nat/openplayground)
-- [Faraday](https://faraday.dev/) (proprietary)
-- [LMStudio](https://lmstudio.ai/) (proprietary)
-- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
-- [ramalama](https://github.com/containers/ramalama) (MIT)
-- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
-- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
-- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
-- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
-- [ollama/ollama](https://github.com/ollama/ollama)
-- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
-- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
-- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
-- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
-- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
-- [RAGNA Desktop](https://ragna.app/) (proprietary)
-- [RecurseChat](https://recurse.chat/) (proprietary)
-- [semperai/amica](https://github.com/semperai/amica)
-- [withcatai/catai](https://github.com/withcatai/catai)
-- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
-- [Msty](https://msty.app) (proprietary)
-- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
-- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
-- [Dot](https://github.com/alexpinel/Dot) (GPL)
-- [MindMac](https://mindmac.app) (proprietary)
-- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
-- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
-- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
-- [AIKit](https://github.com/sozercan/aikit) (MIT)
-- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
-- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
-- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
-- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
+</details>
+
+<details>
+<summary>UIs</summary>

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

-**Tools:**
+- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
+- [Dot](https://github.com/alexpinel/Dot) (GPL)
+- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
+- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
+- [janhq/jan](https://github.com/janhq/jan) (AGPL)
+- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
+- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
+- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
+- [LARS](https://github.com/abgulati/LARS) (AGPL)
+- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
+- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
+- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
+- [LMStudio](https://lmstudio.ai/) (proprietary)
+- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
+- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
+- [MindMac](https://mindmac.app) (proprietary)
+- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
+- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
+- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0)
+- [nat/openplayground](https://github.com/nat/openplayground) (MIT)
+- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT)
+- [ollama/ollama](https://github.com/ollama/ollama) (MIT)
+- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
+- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
+- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT)
+- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT)
+- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
+- [ramalama](https://github.com/containers/ramalama) (MIT)
+- [semperai/amica](https://github.com/semperai/amica) (MIT)
+- [withcatai/catai](https://github.com/withcatai/catai) (MIT)
+
+</details>
+
+<details>
+<summary>Tools</summary>

 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
 - [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
-- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
+- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)

-**Infrastructure:**
+</details>
+
+<details>
+<summary>Infrastructure</summary>

 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
 - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly

-**Games:**
+</details>
+
+<details>
+<summary>Games</summary>

 - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.

-## Demo
-
-<details>
-<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
-
-```
-$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
-I llama.cpp build info:
-I UNAME_S: Darwin
-I UNAME_P: arm
-I UNAME_M: arm64
-I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
-I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
-I LDFLAGS: -framework Accelerate
-I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
-I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
-
-make: Nothing to be done for `default'.
-main: build = 1041 (cf658ad)
-main: seed = 1692823051
-llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
-llama_model_loader: - type f32: 81 tensors
-llama_model_loader: - type q4_0: 281 tensors
-llama_model_loader: - type q6_K: 1 tensors
-llm_load_print_meta: format = GGUF V1 (latest)
-llm_load_print_meta: arch = llama
-llm_load_print_meta: vocab type = SPM
-llm_load_print_meta: n_vocab = 32000
-llm_load_print_meta: n_merges = 0
-llm_load_print_meta: n_ctx_train = 4096
-llm_load_print_meta: n_ctx = 512
-llm_load_print_meta: n_embd = 5120
-llm_load_print_meta: n_head = 40
-llm_load_print_meta: n_head_kv = 40
-llm_load_print_meta: n_layer = 40
-llm_load_print_meta: n_rot = 128
-llm_load_print_meta: n_gqa = 1
-llm_load_print_meta: f_norm_eps = 1.0e-05
-llm_load_print_meta: f_norm_rms_eps = 1.0e-05
-llm_load_print_meta: n_ff = 13824
-llm_load_print_meta: freq_base = 10000.0
-llm_load_print_meta: freq_scale = 1
-llm_load_print_meta: model type = 13B
-llm_load_print_meta: model ftype = mostly Q4_0
-llm_load_print_meta: model size = 13.02 B
-llm_load_print_meta: general.name = LLaMA v2
-llm_load_print_meta: BOS token = 1 '<s>'
-llm_load_print_meta: EOS token = 2 '</s>'
-llm_load_print_meta: UNK token = 0 '<unk>'
-llm_load_print_meta: LF token = 13 '<0x0A>'
-llm_load_tensors: ggml ctx size = 0.11 MB
-llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
-...................................................................................................
-llama_new_context_with_model: kv self size = 400.00 MB
-llama_new_context_with_model: compute buffer total size = 75.41 MB
-
-system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
-sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
-generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
-
-
-Building a website can be done in 10 simple steps:
-Step 1: Find the right website platform.
-Step 2: Choose your domain name and hosting plan.
-Step 3: Design your website layout.
-Step 4: Write your website content and add images.
-Step 5: Install security features to protect your site from hackers or spammers
-Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
-Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine!
-Step 8: Start marketing and promoting the website via social media channels or paid ads
-Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
-Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
-How does a Website Work?
-A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
-The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
-How to
-llama_print_timings: load time = 576.45 ms
-llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
-llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
-llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
-llama_print_timings: total time = 25431.49 ms
-```
-
 </details>

-<details>
-<summary>Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook</summary>
-
-And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
+## Supported backends
+
+| Backend | Target devices |
+| --- | --- |
+| [Metal](docs/build.md#metal-build) | Apple Silicon |
+| [BLAS](docs/build.md#blas-build) | All |
+| [BLIS](docs/backend/BLIS.md) | All |
+| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
+| [CUDA](docs/build.md#cuda) | Nvidia GPU |
+| [hipBLAS](docs/build.md#hipblas) | AMD GPU |
+| [Vulkan](docs/build.md#vulkan) | GPU |
+| [CANN](docs/build.md#cann) | Ascend NPU |

-https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4
-
-</details>
+## Building the project
+
+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
+The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:

-## Usage
+- Clone this repository and build locally, see [how to build](docs/build.md)
+- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
+- Use a Docker image, see [documentation for Docker](docs/docker.md)
+- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases)

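For the "clone and build locally" option, a minimal sketch looks like the commands below. They assume CMake and a C/C++ toolchain are installed; the backend flags are optional and correspond to the backends table earlier in this README.

```bash
# clone and build (see docs/build.md for the full matrix of options)
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp

# plain CPU build; append e.g. -DGGML_METAL=ON or -DGGML_CUDA=ON to enable a GPU backend
cmake -B build
cmake --build build --config Release -j

# the resulting binaries (llama-cli, llama-server, llama-quantize, ...) land in build/bin/
ls build/bin
```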
-Here are the end-to-end binary build and model conversion steps for most supported models.
+## Obtaining and quantizing models

-### Basic usage
+The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:

-Firstly, you need to get the binary. There are different methods that you can follow:
-- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
-- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
-- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
-- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
+- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
+- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)

-You can run a basic completion using this command:
+After downloading a model, use the CLI tools to run it locally - see below.
+
+`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
+
+The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
+
+- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
+- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123)
+- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268)
+- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669)
+
+To learn more about model quantization, [read this documentation](examples/quantize/README.md)

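The local conversion and quantization flow described above can be sketched end to end as follows; the checkpoint path and output file names are placeholders, and `Q4_K_M` is just one common quantization type.

```bash
# convert a local Hugging Face checkpoint to GGUF (path and file names are placeholders)
python convert_hf_to_gguf.py ./path/to/hf-model --outfile model-f16.gguf --outtype f16

# quantize the converted file to a smaller type, e.g. Q4_K_M
./build/bin/llama-quantize model-f16.gguf model-q4_k_m.gguf Q4_K_M
```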
+## [`llama-cli`](examples/main)
+
+#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
+
+- <details open>
+<summary>Run simple text completion</summary>

 ```bash
-llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
+llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128

-# Output:
 # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
 ```

-See [this page](./examples/main/README.md) for a full list of parameters.
+</details>

-### Conversation mode
+- <details>
+<summary>Run in conversation mode</summary>

-If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:

 ```bash
-llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
+llama-cli -m model.gguf -p "You are a helpful assistant" -cnv

-# Output:
 # > hi, who are you?
 # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
 #
@@ -327,124 +281,174 @@ llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
 # Easy peasy! The answer to 1+1 is... 2!
 ```

-By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+</details>
+
+- <details>
+<summary>Run with custom chat template</summary>

 ```bash
-./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
+# use the "chatml" template
+llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
+
+# use a custom template
+llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
 ```

-You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
+[Supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+
+</details>
+
+- <details>
+<summary>Constrain the output with a custom grammar</summary>

 ```bash
-./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
+llama-cli -m model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+
+# {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"}
 ```

-### Web server
+The [grammars/](grammars/) folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](grammars/README.md).

-[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
+For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/

-Example usage:
+</details>

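Beyond the bundled `grammars/json.gbnf`, a custom grammar can be as small as a single rule. The sketch below writes one and constrains `llama-cli` with it; the yes/no grammar and its file name are purely illustrative and not part of this diff.

```bash
# a one-rule GBNF grammar that only admits "yes" or "no"
cat > yes-no.gbnf << 'EOF'
root ::= "yes" | "no"
EOF

llama-cli -m model.gguf -n 4 --grammar-file yes-no.gbnf -p "Is the sky blue? Answer:"
```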
+## [`llama-server`](examples/server)
+
+#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.
+
+- <details open>
+<summary>Start a local HTTP server with default configuration on port 8080</summary>

 ```bash
-./llama-server -m your_model.gguf --port 8080
+llama-server -m model.gguf --port 8080

 # Basic web UI can be accessed via browser: http://localhost:8080
 # Chat completion endpoint: http://localhost:8080/v1/chat/completions
 ```

-### Interactive mode
+</details>

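Once the server above is running, its OpenAI-compatible endpoint can be exercised with plain `curl`; the request below is a minimal sketch with default sampling and no authentication.

```bash
# send a chat completion request to the local server started above
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user",   "content": "Hello!"}
          ]
        }'
```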
-> [!NOTE]
-> If you prefer basic usage, please consider using conversation mode instead of interactive mode
+- <details>
+<summary>Support multiple-users and parallel decoding</summary>

-In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
-
-Here is an example of a few-shot interaction, invoked with the command

 ```bash
-# default arguments using a 7B model
-./examples/chat.sh
-
-# advanced chat with a 13B model
-./examples/chat-13B.sh
-
-# custom arguments using a 13B model
-./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+# up to 4 concurrent requests, each with 4096 max context
+llama-server -m model.gguf -c 16384 -np 4
 ```

-Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
+</details>

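A rough way to observe the effect of `-np 4` is to fire several requests at once; the loop below is only an illustration and assumes the server from the previous block is still running.

```bash
# four requests in flight at the same time; with -np 4 they are decoded in parallel
for i in 1 2 3 4; do
    curl -s http://localhost:8080/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d '{"messages": [{"role": "user", "content": "Write a haiku about llamas."}]}' &
done
wait
```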
+- <details>
+<summary>Enable speculative decoding</summary>

-### Persistent Interaction
-
-The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.

 ```bash
-# Start a new chat
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
-
-# Resume that chat
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
-
-# Start a different chat with the same prompt/model
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
-
-# Different prompt cache for different prompt/model
-PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
-CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
+# the draft.gguf model should be a small variant of the target model.gguf
+llama-server -m model.gguf -md draft.gguf
 ```

-### Constrained output with grammars
+</details>

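A possible variant of the command above also offloads both models to the GPU, with `-ngl` applying to the target and `-ngld` to the draft model; treat this as a sketch, since flag availability depends on the build in use.

```bash
# speculative decoding with target and draft model both offloaded to the GPU
llama-server -m model.gguf -md draft.gguf -ngl 99 -ngld 99
```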
-`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
+- <details>
+<summary>Serve an embedding model</summary>

 ```bash
-./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+# use the /embedding endpoint
+llama-server -m model.gguf --embedding --pooling cls -ub 8192
 ```

-The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
+</details>

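With the server started as above, embeddings can be requested over HTTP; the request below assumes the default `/embedding` route and a JSON body with a `content` field.

```bash
# request an embedding vector for a short piece of text
curl http://localhost:8080/embedding \
    -H "Content-Type: application/json" \
    -d '{"content": "Hello, world!"}'
```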
-For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
+- <details>
+<summary>Serve a reranking model</summary>

-## Build
-
-Please refer to [Build llama.cpp locally](./docs/build.md)
+```bash
+# use the /reranking endpoint
+llama-server -m model.gguf --reranking
+```
+
+</details>

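A sketch of a rerank request against the server above follows; the `query`/`documents` field names follow the Jina-style API the server mimics, but the exact schema should be checked against the server README for the build in use.

```bash
# score candidate documents against a query
curl http://localhost:8080/reranking \
    -H "Content-Type: application/json" \
    -d '{
          "query": "What is a panda?",
          "documents": ["hi", "The giant panda is a bear species endemic to China."]
        }'
```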
-## Supported backends
+- <details>
+<summary>Constrain all outputs with a grammar</summary>

-| Backend | Target devices |
-| --- | --- |
-| [Metal](./docs/build.md#metal-build) | Apple Silicon |
-| [BLAS](./docs/build.md#blas-build) | All |
-| [BLIS](./docs/backend/BLIS.md) | All |
-| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU |
-| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
-| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
-| [Vulkan](./docs/build.md#vulkan) | GPU |
-| [CANN](./docs/build.md#cann) | Ascend NPU |
+```bash
+# custom grammar
+llama-server -m model.gguf --grammar-file grammar.gbnf

-## Tools
+# JSON
+llama-server -m model.gguf --grammar-file grammars/json.gbnf
+```

-### Prepare and Quantize
+</details>

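Besides the server-wide `--grammar-file` shown above, a grammar can also be supplied per request on the `/completion` endpoint; the inline yes/no grammar below is only an illustration.

```bash
# per-request grammar on the /completion endpoint
curl http://localhost:8080/completion \
    -H "Content-Type: application/json" \
    -d '{
          "prompt": "Is water wet? Answer:",
          "n_predict": 4,
          "grammar": "root ::= \"yes\" | \"no\""
        }'
```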
-> [!NOTE]
-> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
-
-To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
+## [`llama-perplexity`](examples/perplexity)

-Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
-It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
+#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.

-To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
+- <details open>
+<summary>Measure the perplexity over a text file</summary>

-### Perplexity (measuring model quality)
-
-You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
-For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
+```bash
+llama-perplexity -m model.gguf -f file.txt
+
+# [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ...
+# Final estimate: PPL = 5.4007 +/- 0.67339
+```
+
+</details>
+
+- <details>
+<summary>Measure KL divergence</summary>
+
+```bash
+# TODO
+```
+
+</details>
+
+[^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
+[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)

+## [`llama-bench`](example/bench)
+
+#### Benchmark the performance of the inference for various parameters.
+
+- <details open>
+<summary>Run default benchmark</summary>
+
+```bash
+llama-bench -m model.gguf
+
+# Output:
+# | model | size | params | backend | threads | test | t/s |
+# | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: |
+# | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | pp512 | 5765.41 ± 20.55 |
+# | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | tg128 | 197.71 ± 0.81 |
+#
+# build: 3e0ba0e60 (4229)
+```
+
+</details>

|
||||||
|
|
||||||
|
#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
|
||||||
|
|
||||||
|
- <details>
|
||||||
|
<summary>Basic text completion</summary>
|
||||||
|
|
||||||
|
```bash
|
||||||
|
llama-simple -m model.gguf
|
||||||
|
|
||||||
|
# Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called "The Art of
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
|
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
|
@@ -457,22 +461,21 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)

-## Other documentations
+## Other documentation

-- [main (cli)](./examples/main/README.md)
-- [server](./examples/server/README.md)
-- [jeopardy](./examples/jeopardy/README.md)
-- [GBNF grammars](./grammars/README.md)
+- [main (cli)](examples/main/README.md)
+- [server](examples/server/README.md)
+- [GBNF grammars](grammars/README.md)

-**Development documentations**
+#### Development documentation

-- [How to build](./docs/build.md)
-- [Running on Docker](./docs/docker.md)
-- [Build on Android](./docs/android.md)
-- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
+- [How to build](docs/build.md)
+- [Running on Docker](docs/docker.md)
+- [Build on Android](docs/android.md)
+- [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)

-**Seminal papers and background on the models**
+#### Seminal papers and background on the models

 If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
 - LLaMA:

@@ -483,3 +486,6 @@ If your issue is with model generation quality, then please at least scan the fo
 - GPT-3.5 / InstructGPT / ChatGPT:
 - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
 - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
+
+#### References

171 ci/run.sh

@@ -39,7 +39,7 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
-CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
+CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then

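For reference, this script is typically exercised locally along the lines of the sketch below; the output and mount directories are placeholders, and `GG_BUILD_METAL` is the toggle read by the hunk above.

```bash
mkdir -p tmp

# CPU-only run
bash ./ci/run.sh ./tmp/results ./tmp/mnt

# same run with the Metal code path enabled
GG_BUILD_METAL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```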
@ -326,36 +326,36 @@ function gg_run_open_llama_7b_v2 {
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@@ -460,34 +460,34 @@ function gg_run_pythia_1_4b {

     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-    (time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

     function check_ppl {
         qnt="$1"
@@ -591,36 +591,36 @@ function gg_run_pythia_2_8b {

     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-    (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

     function check_ppl {
         qnt="$1"
@@ -706,8 +706,8 @@ function gg_run_embd_bge_small {

     ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

-    (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

     set +e
 }
@@ -752,7 +752,7 @@ function gg_run_rerank_tiny {
     model_f16="${path_models}/ggml-model-f16.gguf"

     # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

     # sample output
     # rerank score 0: 0.029
@@ -815,7 +815,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    ln -sfn ${mnt_models} ${SRC}/models-mnt

    # Create a fresh python3 venv and enter it
-   python3 -m venv "$MNT/venv"
+   if ! python3 -m venv "$MNT/venv"; then
+       echo "Error: Failed to create Python virtual environment at $MNT/venv."
+       exit 1
+   fi
    source "$MNT/venv/bin/activate"

    pip install -r ${SRC}/requirements.txt --disable-pip-version-check
16
cmake/arm64-apple-clang.cmake
Normal file
@@ -0,0 +1,16 @@
+set( CMAKE_SYSTEM_NAME Darwin )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-apple-darwin-macho )
+
+set( CMAKE_C_COMPILER clang )
+set( CMAKE_CXX_COMPILER clang++ )
+
+set( CMAKE_C_COMPILER_TARGET ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
+
+set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
+set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
+
+set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
33
cmake/common.cmake
Normal file
@@ -0,0 +1,33 @@
+function(llama_add_compile_flags)
+    if (LLAMA_FATAL_WARNINGS)
+        if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+            list(APPEND C_FLAGS   -Werror)
+            list(APPEND CXX_FLAGS -Werror)
+        elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+            add_compile_options(/WX)
+        endif()
+    endif()
+
+    if (LLAMA_ALL_WARNINGS)
+        if (NOT MSVC)
+            list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+                                -Werror=implicit-int -Werror=implicit-function-declaration)
+
+            list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
+
+            list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+
+            list(APPEND C_FLAGS   ${WARNING_FLAGS})
+            list(APPEND CXX_FLAGS ${WARNING_FLAGS})
+
+            ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+
+            add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
+                                "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+        else()
+            # todo : msvc
+            set(C_FLAGS   "" PARENT_SCOPE)
+            set(CXX_FLAGS "" PARENT_SCOPE)
+        endif()
+    endif()
+endfunction()
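A minimal usage sketch for the helper above (editorial note, not part of the diff): a directory-level CMakeLists.txt calls llama_add_compile_flags() before declaring its targets so the warning/-Werror options apply to everything compiled in that directory, mirroring the call added to common/CMakeLists.txt later in this diff. The target name and source file are hypothetical, and it assumes the top-level project has already included cmake/common.cmake and defined the LLAMA_FATAL_WARNINGS / LLAMA_ALL_WARNINGS options (the latter also needs ggml_get_flags() from ggml's build scripts, as in the llama.cpp tree).

    # hypothetical subdirectory CMakeLists.txt inside such a tree
    llama_add_compile_flags()                         # apply project-wide warning options to this directory

    add_library(example-common STATIC example.cpp)    # hypothetical target built with those flags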
@@ -3,18 +3,60 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
 set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
 set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)

-set(GGML_BLAS @GGML_BLAS@)
+set(GGML_STATIC @GGML_STATIC@)
-set(GGML_CUDA @GGML_CUDA@)
+set(GGML_NATIVE @GGML_NATIVE@)
-set(GGML_METAL @GGML_METAL@)
+set(GGML_LTO @GGML_LTO@)
-set(GGML_HIPBLAS @GGML_HIPBLAS@)
+set(GGML_CCACHE @GGML_CCACHE@)
+set(GGML_AVX @GGML_AVX@)
+set(GGML_AVX2 @GGML_AVX2@)
+set(GGML_AVX512 @GGML_AVX512@)
+set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@)
+set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@)
+set(GGML_AVX512_BF16 @GGML_AVX512_BF16@)
+set(GGML_AMX_TILE @GGML_AMX_TILE@)
+set(GGML_AMX_INT8 @GGML_AMX_INT8@)
+set(GGML_AMX_BF16 @GGML_AMX_BF16@)
+set(GGML_FMA @GGML_FMA@)
+set(GGML_LASX @GGML_LASX@)
+set(GGML_LSX @GGML_LSX@)
+set(GGML_RVV @GGML_RVV@)
+set(GGML_SVE @GGML_SVE@)

 set(GGML_ACCELERATE @GGML_ACCELERATE@)
-set(GGML_VULKAN @GGML_VULKAN@)
+set(GGML_OPENMP @GGML_OPENMP@)
+set(GGML_CPU_HBM @GGML_CPU_HBM@)
+set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@)
+
+set(GGML_CUDA_FORCE_MMQ @GGML_CUDA_FORCE_MMQ@)
+set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@)
+set(GGML_CUDA_F16 @GGML_CUDA_F16@)
+set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@)
+set(GGML_CUDA_NO_PEER_COPY @GGML_CUDA_NO_PEER_COPY@)
+set(GGML_CUDA_NO_VMM @GGML_CUDA_NO_VMM@)
+set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@)
+set(GGML_CUDA_GRAPHS @GGML_CUDA_GRAPHS@)
+
+set(GGML_HIP_UMA @GGML_HIP_UMA@)
+
 set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
 set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
 set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
+set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@)
+set(GGML_VULKAN_PERF @GGML_VULKAN_PERF@)
 set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
-set(GGML_SYCL @GGML_SYCL@)
+set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@)
-set(GGML_OPENMP @GGML_OPENMP@)
+set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@)
+set(GGML_METAL_NDEBUG @GGML_METAL_NDEBUG@)
+set(GGML_METAL_SHADER_DEBUG @GGML_METAL_SHADER_DEBUG@)
+set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@)
+set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@)
+set(GGML_METAL_STD @GGML_METAL_STD@)
+
+set(GGML_SYCL_F16 @GGML_SYCL_F16@)
+set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@)
+set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@)

 @PACKAGE_INIT@

@@ -22,16 +64,60 @@ set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
 set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
 set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

-# Ensure transient dependencies satisfied

 find_package(Threads REQUIRED)

+set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
+set(_llama_link_deps "")
+set(_llama_link_opts "")
+foreach(_ggml_lib ggml ggml-base)
+    string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
+    find_library(${_ggml_lib_var} ${_ggml_lib}
+        REQUIRED
+        HINTS ${LLAMA_LIB_DIR}
+        NO_CMAKE_FIND_ROOT_PATH
+    )
+    list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
+    message(STATUS "Found ${${_ggml_lib_var}}")
+endforeach()
+
+foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan)
+    string(TOUPPER "GGML_${backend}" backend_id)
+    set(_ggml_lib "ggml-${backend}")
+    string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
+
+    find_library(${_ggml_lib_var} ${_ggml_lib}
+        HINTS ${LLAMA_LIB_DIR}
+        NO_CMAKE_FIND_ROOT_PATH
+    )
+    if(${_ggml_lib_var})
+        list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
+        set(${backend_id} ON)
+        message(STATUS "Found backend ${${_ggml_lib_var}}")
+    else()
+        set(${backend_id} OFF)
+    endif()
+endforeach()
+
+if (NOT LLAMA_SHARED_LIB)
 if (APPLE AND GGML_ACCELERATE)
     find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
+    list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK})
+endif()
+
+if (GGML_OPENMP)
+    find_package(OpenMP REQUIRED)
+    list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+endif()
+
+if (GGML_CPU_HBM)
+    find_library(memkind memkind REQUIRED)
+    list(APPEND _llama_link_deps memkind)
 endif()

 if (GGML_BLAS)
     find_package(BLAS REQUIRED)
+    list(APPEND _llama_link_deps ${BLAS_LIBRARIES})
+    list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS})
 endif()

 if (GGML_CUDA)

@@ -42,45 +128,47 @@ if (GGML_METAL)
     find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
     find_library(METAL_FRAMEWORK Metal REQUIRED)
     find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+    list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
 endif()

 if (GGML_VULKAN)
     find_package(Vulkan REQUIRED)
+    list(APPEND _llama_link_deps Vulkan::Vulkan)
 endif()

-if (GGML_HIPBLAS)
+if (GGML_HIP)
     find_package(hip REQUIRED)
     find_package(hipblas REQUIRED)
     find_package(rocblas REQUIRED)
+    list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas)
 endif()

 if (GGML_SYCL)
+    find_package(DNNL)
+    if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+        list(APPEND _llama_link_deps DNNL::dnnl)
+    endif()
+    if (WIN32)
        find_package(IntelSYCL REQUIRED)
        find_package(MKL REQUIRED)
+        list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+    endif()
 endif()

-if (GGML_OPENMP)
-    find_package(OpenMP REQUIRED)
 endif()

-find_library(ggml_LIBRARY ggml
-    REQUIRED
-    HINTS ${LLAMA_LIB_DIR})
-
 find_library(llama_LIBRARY llama
     REQUIRED
-    HINTS ${LLAMA_LIB_DIR})
+    HINTS ${LLAMA_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH
-set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
+)
-set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")

 add_library(llama UNKNOWN IMPORTED)

 set_target_properties(llama
     PROPERTIES
         INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
         INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
+        INTERFACE_LINK_OPTIONS "${_llama_link_opts}"
         INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
         IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
         IMPORTED_LOCATION "${llama_LIBRARY}"
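For context (editorial note, not part of the diff): this template is installed as llama-config.cmake, and the imported llama target it defines carries the include directory, link dependencies, link options and compile definitions set at the end of the file. A minimal downstream sketch, with hypothetical project and source names, assuming llama.cpp has been installed somewhere CMake can locate the package:

    cmake_minimum_required(VERSION 3.14)
    project(llama-consumer LANGUAGES CXX)

    find_package(llama REQUIRED)                      # loads the generated llama-config.cmake

    add_executable(consumer main.cpp)                 # hypothetical executable
    target_link_libraries(consumer PRIVATE llama)     # imported target from the package config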
@@ -2,6 +2,8 @@

 find_package(Threads REQUIRED)

+llama_add_compile_flags()
+
 # Build info header
 #

@@ -68,9 +70,9 @@ add_library(${TARGET} STATIC
     ngram-cache.h
     sampling.cpp
     sampling.h
+    speculative.cpp
+    speculative.h
     tool-call.cpp
-    train.cpp
-    train.h
 )

 if (BUILD_SHARED_LIBS)

@@ -89,5 +91,5 @@ if (LLAMA_CURL)
 endif ()

 target_include_directories(${TARGET} PUBLIC .)
-target_compile_features   (${TARGET} PUBLIC cxx_std_11)
+target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
535
common/arg.cpp

@@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) {
         }
         params.hf_file = params.model;
     } else if (params.model.empty()) {
-        params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
+        // this is to avoid different repo having same file name, or same file name in different subdirs
+        std::string filename = params.hf_repo + "_" + params.hf_file;
+        // to make sure we don't have any slashes in the filename
+        string_replace_all(filename, "/", "_");
+        params.model = fs_get_cache_file(filename);
     }
 } else if (!params.model_url.empty()) {
     if (params.model.empty()) {

@@ -235,8 +239,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context

     postprocess_cpu_params(params.cpuparams, nullptr);
     postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+    postprocess_cpu_params(params.speculative.cpuparams, &params.cpuparams);
+    postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch);

     if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");

@@ -251,7 +256,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
     for (auto & antiprompt : params.antiprompt) {
         string_process_escapes(antiprompt);
     }
-    for (auto & seq_breaker : params.sparams.dry_sequence_breakers) {
+    for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
         string_process_escapes(seq_breaker);
     }
 }

@@ -297,6 +302,27 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }

+static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    auto dev_names = string_split<std::string>(value, ',');
+    if (dev_names.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (dev_names.size() == 1 && dev_names[0] == "none") {
+        devices.push_back(nullptr);
+    } else {
+        for (const auto & device : dev_names) {
+            auto * dev = ggml_backend_dev_by_name(device.c_str());
+            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+            }
+            devices.push_back(dev);
+        }
+        devices.push_back(nullptr);
+    }
+    return devices;
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params

@@ -322,14 +348,29 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
         return true;
     }

+static std::string list_builtin_chat_templates() {
+    std::vector<const char *> supported_tmpl;
+    int32_t res = llama_chat_builtin_templates(nullptr, 0);
+    supported_tmpl.resize(res);
+    res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
+    std::ostringstream msg;
+    for (auto & tmpl : supported_tmpl) {
+        msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // load dynamic backends
+    ggml_backend_load_all();
+
     common_params_context ctx_arg(params);
     ctx_arg.print_usage = print_usage;
     ctx_arg.ex = ex;

     std::string sampler_type_chars;
     std::string sampler_type_names;
-    for (const auto & sampler : params.sparams.samplers) {
+    for (const auto & sampler : params.sampling.samplers) {
         sampler_type_chars += common_sampler_type_to_chr(sampler);
         sampler_type_names += common_sampler_type_to_str(sampler) + ";";
     }
@ -407,26 +448,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
|
||||||
{"-td", "--threads-draft"}, "N",
|
|
||||||
"number of threads to use during generation (default: same as --threads)",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.draft_cpuparams.n_threads = value;
|
|
||||||
if (params.draft_cpuparams.n_threads <= 0) {
|
|
||||||
params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"-tbd", "--threads-batch-draft"}, "N",
|
|
||||||
"number of threads to use during batch and prompt processing (default: same as --threads-draft)",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.draft_cpuparams_batch.n_threads = value;
|
|
||||||
if (params.draft_cpuparams_batch.n_threads <= 0) {
|
|
||||||
params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-C", "--cpu-mask"}, "M",
|
{"-C", "--cpu-mask"}, "M",
|
||||||
"CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
|
"CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
|
||||||
|
@ -515,108 +536,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.cpuparams_batch.poll = value;
|
params.cpuparams_batch.poll = value;
|
||||||
}
|
}
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
|
||||||
{"-Cd", "--cpu-mask-draft"}, "M",
|
|
||||||
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
|
||||||
[](common_params & params, const std::string & mask) {
|
|
||||||
params.draft_cpuparams.mask_valid = true;
|
|
||||||
if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
|
|
||||||
throw std::invalid_argument("invalid cpumask");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"-Crd", "--cpu-range-draft"}, "lo-hi",
|
|
||||||
"Ranges of CPUs for affinity. Complements --cpu-mask-draft",
|
|
||||||
[](common_params & params, const std::string & range) {
|
|
||||||
params.draft_cpuparams.mask_valid = true;
|
|
||||||
if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
|
|
||||||
throw std::invalid_argument("invalid range");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--cpu-strict-draft"}, "<0|1>",
|
|
||||||
"Use strict CPU placement for draft model (default: same as --cpu-strict)",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.draft_cpuparams.strict_cpu = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--prio-draft"}, "N",
|
|
||||||
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
|
|
||||||
[](common_params & params, int prio) {
|
|
||||||
if (prio < 0 || prio > 3) {
|
|
||||||
throw std::invalid_argument("invalid value");
|
|
||||||
}
|
|
||||||
params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--poll-draft"}, "<0|1>",
|
|
||||||
"Use polling to wait for draft model work (default: same as --poll])",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.draft_cpuparams.poll = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"-Cbd", "--cpu-mask-batch-draft"}, "M",
|
|
||||||
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
|
||||||
[](common_params & params, const std::string & mask) {
|
|
||||||
params.draft_cpuparams_batch.mask_valid = true;
|
|
||||||
if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
|
|
||||||
throw std::invalid_argument("invalid cpumask");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
|
|
||||||
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
|
|
||||||
[](common_params & params, const std::string & range) {
|
|
||||||
params.draft_cpuparams_batch.mask_valid = true;
|
|
||||||
if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
|
|
||||||
throw std::invalid_argument("invalid cpumask");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--cpu-strict-batch-draft"}, "<0|1>",
|
|
||||||
"Use strict CPU placement for draft model (default: --cpu-strict-draft)",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.draft_cpuparams_batch.strict_cpu = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--prio-batch-draft"}, "N",
|
|
||||||
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
|
|
||||||
[](common_params & params, int prio) {
|
|
||||||
if (prio < 0 || prio > 3) {
|
|
||||||
throw std::invalid_argument("invalid value");
|
|
||||||
}
|
|
||||||
params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--poll-batch-draft"}, "<0|1>",
|
|
||||||
"Use polling to wait for draft model work (default: --poll-draft)",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.draft_cpuparams_batch.poll = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--draft"}, "N",
|
|
||||||
string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.n_draft = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"-ps", "--p-split"}, "N",
|
|
||||||
string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
|
|
||||||
[](common_params & params, const std::string & value) {
|
|
||||||
params.p_split = std::stof(value);
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-lcs", "--lookup-cache-static"}, "FNAME",
|
{"-lcs", "--lookup-cache-static"}, "FNAME",
|
||||||
"path to static lookup cache to use for lookup decoding (not updated by generation)",
|
"path to static lookup cache to use for lookup decoding (not updated by generation)",
|
||||||
|
@ -701,7 +620,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.no_perf = true;
|
params.no_perf = true;
|
||||||
params.sparams.no_perf = true;
|
params.sampling.no_perf = true;
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_NO_PERF"));
|
).set_env("LLAMA_ARG_NO_PERF"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
|
@ -883,155 +802,155 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
const auto sampler_names = string_split<std::string>(value, ';');
|
const auto sampler_names = string_split<std::string>(value, ';');
|
||||||
params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
|
params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-s", "--seed"}, "SEED",
|
{"-s", "--seed"}, "SEED",
|
||||||
string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
|
string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.seed = std::stoul(value);
|
params.sampling.seed = std::stoul(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--sampling-seq"}, "SEQUENCE",
|
{"--sampling-seq"}, "SEQUENCE",
|
||||||
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
|
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.samplers = common_sampler_types_from_chars(value);
|
params.sampling.samplers = common_sampler_types_from_chars(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--ignore-eos"},
|
{"--ignore-eos"},
|
||||||
"ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
|
"ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.sparams.ignore_eos = true;
|
params.sampling.ignore_eos = true;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--penalize-nl"},
|
{"--penalize-nl"},
|
||||||
string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
|
string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.sparams.penalize_nl = true;
|
params.sampling.penalize_nl = true;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--temp"}, "N",
|
{"--temp"}, "N",
|
||||||
string_format("temperature (default: %.1f)", (double)params.sparams.temp),
|
string_format("temperature (default: %.1f)", (double)params.sampling.temp),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.temp = std::stof(value);
|
params.sampling.temp = std::stof(value);
|
||||||
params.sparams.temp = std::max(params.sparams.temp, 0.0f);
|
params.sampling.temp = std::max(params.sampling.temp, 0.0f);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--top-k"}, "N",
|
{"--top-k"}, "N",
|
||||||
string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
|
string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.sparams.top_k = value;
|
params.sampling.top_k = value;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--top-p"}, "N",
|
{"--top-p"}, "N",
|
||||||
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
|
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.top_p = std::stof(value);
|
params.sampling.top_p = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--min-p"}, "N",
|
{"--min-p"}, "N",
|
||||||
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
|
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.min_p = std::stof(value);
|
params.sampling.min_p = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--xtc-probability"}, "N",
|
{"--xtc-probability"}, "N",
|
||||||
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
|
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.xtc_probability = std::stof(value);
|
params.sampling.xtc_probability = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--xtc-threshold"}, "N",
|
{"--xtc-threshold"}, "N",
|
||||||
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
|
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.xtc_threshold = std::stof(value);
|
params.sampling.xtc_threshold = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--typical"}, "N",
|
{"--typical"}, "N",
|
||||||
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
|
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.typ_p = std::stof(value);
|
params.sampling.typ_p = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--repeat-last-n"}, "N",
|
{"--repeat-last-n"}, "N",
|
||||||
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
|
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.sparams.penalty_last_n = value;
|
params.sampling.penalty_last_n = value;
|
||||||
params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
|
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--repeat-penalty"}, "N",
|
{"--repeat-penalty"}, "N",
|
||||||
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
|
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.penalty_repeat = std::stof(value);
|
params.sampling.penalty_repeat = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--presence-penalty"}, "N",
|
{"--presence-penalty"}, "N",
|
||||||
string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
|
string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.penalty_present = std::stof(value);
|
params.sampling.penalty_present = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--frequency-penalty"}, "N",
|
{"--frequency-penalty"}, "N",
|
||||||
string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
|
string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.penalty_freq = std::stof(value);
|
params.sampling.penalty_freq = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dry-multiplier"}, "N",
|
{"--dry-multiplier"}, "N",
|
||||||
string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier),
|
string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.dry_multiplier = std::stof(value);
|
params.sampling.dry_multiplier = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dry-base"}, "N",
|
{"--dry-base"}, "N",
|
||||||
string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base),
|
string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
float potential_base = std::stof(value);
|
float potential_base = std::stof(value);
|
||||||
if (potential_base >= 1.0f)
|
if (potential_base >= 1.0f)
|
||||||
{
|
{
|
||||||
params.sparams.dry_base = potential_base;
|
params.sampling.dry_base = potential_base;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dry-allowed-length"}, "N",
|
{"--dry-allowed-length"}, "N",
|
||||||
string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length),
|
string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.sparams.dry_allowed_length = value;
|
params.sampling.dry_allowed_length = value;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dry-penalty-last-n"}, "N",
|
{"--dry-penalty-last-n"}, "N",
|
||||||
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n),
|
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.sparams.dry_penalty_last_n = value;
|
params.sampling.dry_penalty_last_n = value;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dry-sequence-breaker"}, "STRING",
|
{"--dry-sequence-breaker"}, "STRING",
|
||||||
string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
|
string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
|
||||||
params.sparams.dry_sequence_breakers.empty() ? "none" :
|
params.sampling.dry_sequence_breakers.empty() ? "none" :
|
||||||
std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()),
|
std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
|
||||||
params.sparams.dry_sequence_breakers.end(),
|
params.sampling.dry_sequence_breakers.end(),
|
||||||
std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'",
|
std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
|
||||||
[](const std::string& a, const std::string& b) {
|
[](const std::string& a, const std::string& b) {
|
||||||
std::string formatted_b = (b == "\n") ? "\\n" : b;
|
std::string formatted_b = (b == "\n") ? "\\n" : b;
|
||||||
return a + ", '" + formatted_b + "'";
|
return a + ", '" + formatted_b + "'";
|
||||||
|
@@ -1040,51 +959,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
static bool defaults_cleared = false;
|
static bool defaults_cleared = false;
|
||||||
|
|
||||||
if (!defaults_cleared) {
|
if (!defaults_cleared) {
|
||||||
params.sparams.dry_sequence_breakers.clear();
|
params.sampling.dry_sequence_breakers.clear();
|
||||||
defaults_cleared = true;
|
defaults_cleared = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (value == "none") {
|
if (value == "none") {
|
||||||
params.sparams.dry_sequence_breakers.clear();
|
params.sampling.dry_sequence_breakers.clear();
|
||||||
} else {
|
} else {
|
||||||
params.sparams.dry_sequence_breakers.emplace_back(value);
|
params.sampling.dry_sequence_breakers.emplace_back(value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dynatemp-range"}, "N",
|
{"--dynatemp-range"}, "N",
|
||||||
string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
|
string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.dynatemp_range = std::stof(value);
|
params.sampling.dynatemp_range = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dynatemp-exp"}, "N",
|
{"--dynatemp-exp"}, "N",
|
||||||
string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
|
string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.dynatemp_exponent = std::stof(value);
|
params.sampling.dynatemp_exponent = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--mirostat"}, "N",
|
{"--mirostat"}, "N",
|
||||||
string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
|
string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
|
||||||
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
|
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.sparams.mirostat = value;
|
params.sampling.mirostat = value;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--mirostat-lr"}, "N",
|
{"--mirostat-lr"}, "N",
|
||||||
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
|
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.mirostat_eta = std::stof(value);
|
params.sampling.mirostat_eta = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--mirostat-ent"}, "N",
|
{"--mirostat-ent"}, "N",
|
||||||
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
|
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.mirostat_tau = std::stof(value);
|
params.sampling.mirostat_tau = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
|
@@ -1100,7 +1019,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
try {
|
try {
|
||||||
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
||||||
const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
||||||
params.sparams.logit_bias.push_back({key, bias});
|
params.sampling.logit_bias.push_back({key, bias});
|
||||||
} else {
|
} else {
|
||||||
throw std::invalid_argument("invalid input format");
|
throw std::invalid_argument("invalid input format");
|
||||||
}
|
}
|
||||||
|
@@ -1111,9 +1030,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--grammar"}, "GRAMMAR",
|
{"--grammar"}, "GRAMMAR",
|
||||||
string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
|
string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.grammar = value;
|
params.sampling.grammar = value;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
|
@@ -1127,7 +1046,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
std::copy(
|
std::copy(
|
||||||
std::istreambuf_iterator<char>(file),
|
std::istreambuf_iterator<char>(file),
|
||||||
std::istreambuf_iterator<char>(),
|
std::istreambuf_iterator<char>(),
|
||||||
std::back_inserter(params.sparams.grammar)
|
std::back_inserter(params.sampling.grammar)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
|
@@ -1135,7 +1054,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
{"-j", "--json-schema"}, "SCHEMA",
|
{"-j", "--json-schema"}, "SCHEMA",
|
||||||
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
|
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.grammar = json_schema_to_grammar(json::parse(value));
|
params.sampling.grammar = json_schema_to_grammar(json::parse(value));
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
|
@@ -1433,28 +1352,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
else { throw std::invalid_argument("invalid value"); }
|
else { throw std::invalid_argument("invalid value"); }
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_NUMA"));
|
).set_env("LLAMA_ARG_NUMA"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"-dev", "--device"}, "<dev1,dev2,..>",
|
||||||
|
"comma-separated list of devices to use for offloading (none = don't offload)\n"
|
||||||
|
"use --list-devices to see a list of available devices",
|
||||||
|
[](common_params & params, const std::string & value) {
|
||||||
|
params.devices = parse_device_list(value);
|
||||||
|
}
|
||||||
|
).set_env("LLAMA_ARG_DEVICE"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--list-devices"},
|
||||||
|
"print list of available devices and exit",
|
||||||
|
[](common_params &) {
|
||||||
|
printf("Available devices:\n");
|
||||||
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||||
|
auto * dev = ggml_backend_dev_get(i);
|
||||||
|
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
|
||||||
|
size_t free, total;
|
||||||
|
ggml_backend_dev_memory(dev, &free, &total);
|
||||||
|
printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
||||||
"number of layers to store in VRAM",
|
"number of layers to store in VRAM",
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_gpu_layers = value;
|
params.n_gpu_layers = value;
|
||||||
if (!llama_supports_gpu_offload()) {
|
if (!llama_supports_gpu_offload()) {
|
||||||
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
|
fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
|
||||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
|
||||||
|
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
|
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
|
||||||
add_opt(common_arg(
|
|
||||||
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
|
||||||
"number of layers to store in VRAM for the draft model",
|
|
||||||
[](common_params & params, int value) {
|
|
||||||
params.n_gpu_layers_draft = value;
|
|
||||||
if (!llama_supports_gpu_offload()) {
|
|
||||||
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
|
||||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-sm", "--split-mode"}, "{none,layer,row}",
|
{"-sm", "--split-mode"}, "{none,layer,row}",
|
||||||
"how to split the model across multiple GPUs, one of:\n"
|
"how to split the model across multiple GPUs, one of:\n"
|
||||||
|
@@ -1468,10 +1401,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
} else if (arg_next == "layer") {
|
} else if (arg_next == "layer") {
|
||||||
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||||
} else if (arg_next == "row") {
|
} else if (arg_next == "row") {
|
||||||
#ifdef GGML_USE_SYCL
|
|
||||||
fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
|
|
||||||
exit(1);
|
|
||||||
#endif // GGML_USE_SYCL
|
|
||||||
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
||||||
} else {
|
} else {
|
||||||
throw std::invalid_argument("invalid value");
|
throw std::invalid_argument("invalid value");
|
||||||
|
@@ -1593,13 +1522,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.model = value;
|
params.model = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
|
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
|
||||||
add_opt(common_arg(
|
|
||||||
{"-md", "--model-draft"}, "FNAME",
|
|
||||||
"draft model for speculative decoding (default: unused)",
|
|
||||||
[](common_params & params, const std::string & value) {
|
|
||||||
params.model_draft = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-mu", "--model-url"}, "MODEL_URL",
|
{"-mu", "--model-url"}, "MODEL_URL",
|
||||||
"model download url (default: unused)",
|
"model download url (default: unused)",
|
||||||
|
@@ -1911,10 +1833,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--chat-template"}, "JINJA_TEMPLATE",
|
{"--chat-template"}, "JINJA_TEMPLATE",
|
||||||
|
string_format(
|
||||||
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
||||||
"if suffix/prefix are specified, template will be disabled\n"
|
"if suffix/prefix are specified, template will be disabled\n"
|
||||||
"only commonly used templates are accepted (unless --jinja is set before this flag):\n"
|
"only commonly used templates are accepted (unless --jinja is set before this flag):\n"
|
||||||
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
|
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
|
||||||
|
),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
if (!common_chat_verify_template(value, params.use_jinja)) {
|
if (!common_chat_verify_template(value, params.use_jinja)) {
|
||||||
throw std::runtime_error(string_format(
|
throw std::runtime_error(string_format(
|
||||||
|
@@ -1974,17 +1898,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.simple_io = true;
|
params.simple_io = true;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||||
add_opt(common_arg(
|
|
||||||
{"-ld", "--logdir"}, "LOGDIR",
|
|
||||||
"path under which to save YAML logs (no logging if unset)",
|
|
||||||
[](common_params & params, const std::string & value) {
|
|
||||||
params.logdir = value;
|
|
||||||
|
|
||||||
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
|
|
||||||
params.logdir += DIRECTORY_SEPARATOR;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
));
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--positive-file"}, "FNAME",
|
{"--positive-file"}, "FNAME",
|
||||||
string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
|
string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
|
||||||
|
@@ -2083,5 +1996,177 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_LOG_TIMESTAMPS"));
|
).set_env("LLAMA_LOG_TIMESTAMPS"));
|
||||||
|
|
||||||
|
// speculative parameters
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"-td", "--threads-draft"}, "N",
|
||||||
|
"number of threads to use during generation (default: same as --threads)",
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.speculative.cpuparams.n_threads = value;
|
||||||
|
if (params.speculative.cpuparams.n_threads <= 0) {
|
||||||
|
params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"-tbd", "--threads-batch-draft"}, "N",
|
||||||
|
"number of threads to use during batch and prompt processing (default: same as --threads-draft)",
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.speculative.cpuparams_batch.n_threads = value;
|
||||||
|
if (params.speculative.cpuparams_batch.n_threads <= 0) {
|
||||||
|
params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"-Cd", "--cpu-mask-draft"}, "M",
|
||||||
|
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
||||||
|
[](common_params & params, const std::string & mask) {
|
||||||
|
params.speculative.cpuparams.mask_valid = true;
|
||||||
|
if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) {
|
||||||
|
throw std::invalid_argument("invalid cpumask");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"-Crd", "--cpu-range-draft"}, "lo-hi",
|
||||||
|
"Ranges of CPUs for affinity. Complements --cpu-mask-draft",
|
||||||
|
[](common_params & params, const std::string & range) {
|
||||||
|
params.speculative.cpuparams.mask_valid = true;
|
||||||
|
if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) {
|
||||||
|
throw std::invalid_argument("invalid range");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--cpu-strict-draft"}, "<0|1>",
|
||||||
|
"Use strict CPU placement for draft model (default: same as --cpu-strict)",
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.speculative.cpuparams.strict_cpu = value;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--prio-draft"}, "N",
|
||||||
|
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
|
||||||
|
[](common_params & params, int prio) {
|
||||||
|
if (prio < 0 || prio > 3) {
|
||||||
|
throw std::invalid_argument("invalid value");
|
||||||
|
}
|
||||||
|
params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--poll-draft"}, "<0|1>",
|
||||||
|
"Use polling to wait for draft model work (default: same as --poll])",
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.speculative.cpuparams.poll = value;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"-Cbd", "--cpu-mask-batch-draft"}, "M",
|
||||||
|
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
||||||
|
[](common_params & params, const std::string & mask) {
|
||||||
|
params.speculative.cpuparams_batch.mask_valid = true;
|
||||||
|
if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
|
||||||
|
throw std::invalid_argument("invalid cpumask");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
|
||||||
|
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
|
||||||
|
[](common_params & params, const std::string & range) {
|
||||||
|
params.speculative.cpuparams_batch.mask_valid = true;
|
||||||
|
if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
|
||||||
|
throw std::invalid_argument("invalid cpumask");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--cpu-strict-batch-draft"}, "<0|1>",
|
||||||
|
"Use strict CPU placement for draft model (default: --cpu-strict-draft)",
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.speculative.cpuparams_batch.strict_cpu = value;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--prio-batch-draft"}, "N",
|
||||||
|
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
|
||||||
|
[](common_params & params, int prio) {
|
||||||
|
if (prio < 0 || prio > 3) {
|
||||||
|
throw std::invalid_argument("invalid value");
|
||||||
|
}
|
||||||
|
params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--poll-batch-draft"}, "<0|1>",
|
||||||
|
"Use polling to wait for draft model work (default: --poll-draft)",
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.speculative.cpuparams_batch.poll = value;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--draft-max", "--draft", "--draft-n"}, "N",
|
||||||
|
string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.speculative.n_max = value;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--draft-min", "--draft-n-min"}, "N",
|
||||||
|
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.speculative.n_min = value;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--draft-p-split"}, "P",
|
||||||
|
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
|
||||||
|
[](common_params & params, const std::string & value) {
|
||||||
|
params.speculative.p_split = std::stof(value);
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--draft-p-min"}, "P",
|
||||||
|
string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
|
||||||
|
[](common_params & params, const std::string & value) {
|
||||||
|
params.speculative.p_min = std::stof(value);
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"-cd", "--ctx-size-draft"}, "N",
|
||||||
|
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.speculative.n_ctx = value;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"-devd", "--device-draft"}, "<dev1,dev2,..>",
|
||||||
|
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
|
||||||
|
"use --list-devices to see a list of available devices",
|
||||||
|
[](common_params & params, const std::string & value) {
|
||||||
|
params.speculative.devices = parse_device_list(value);
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
||||||
|
"number of layers to store in VRAM for the draft model",
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.speculative.n_gpu_layers = value;
|
||||||
|
if (!llama_supports_gpu_offload()) {
|
||||||
|
fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
|
||||||
|
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
|
||||||
|
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"-md", "--model-draft"}, "FNAME",
|
||||||
|
"draft model for speculative decoding (default: unused)",
|
||||||
|
[](common_params & params, const std::string & value) {
|
||||||
|
params.speculative.model = value;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
|
||||||
return ctx_arg;
|
return ctx_arg;
|
||||||
}
|
}
|
||||||
|
|
|
@@ -539,11 +539,11 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
|
||||||
detokenized.end());
|
detokenized.end());
|
||||||
|
|
||||||
buf << "\n" << std::to_string(i)
|
buf << "\n" << std::to_string(i)
|
||||||
<< ":token '" << detokenized << "'"
|
<< ", token '" << detokenized << "'"
|
||||||
<< ":pos " << std::to_string(batch.pos[i])
|
<< ", pos " << std::to_string(batch.pos[i])
|
||||||
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
|
<< ", n_seq_id " << std::to_string(batch.n_seq_id[i])
|
||||||
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
|
<< ", seq_id " << std::to_string(batch.seq_id[i][0])
|
||||||
<< ":logits " << std::to_string(batch.logits[i]);
|
<< ", logits " << std::to_string(batch.logits[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
buf << " ]";
|
buf << " ]";
|
||||||
|
@@ -654,7 +654,17 @@ bool fs_validate_filename(const std::string & filename) {
|
||||||
|
|
||||||
std::u32string filename_utf32;
|
std::u32string filename_utf32;
|
||||||
try {
|
try {
|
||||||
|
#if defined(__clang__)
|
||||||
|
// disable C++17 deprecation warning for std::codecvt_utf8
|
||||||
|
# pragma clang diagnostic push
|
||||||
|
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||||
|
#endif
|
||||||
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
|
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
|
||||||
|
|
||||||
|
#if defined(__clang__)
|
||||||
|
# pragma clang diagnostic pop
|
||||||
|
#endif
|
||||||
|
|
||||||
filename_utf32 = converter.from_bytes(filename);
|
filename_utf32 = converter.from_bytes(filename);
|
||||||
|
|
||||||
// If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
|
// If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
|
||||||
|
@@ -868,9 +878,9 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
llama_model * model = nullptr;
|
llama_model * model = nullptr;
|
||||||
|
|
||||||
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
|
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
|
||||||
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
|
||||||
} else if (!params.model_url.empty()) {
|
} else if (!params.model_url.empty()) {
|
||||||
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
|
||||||
} else {
|
} else {
|
||||||
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||||
}
|
}
|
||||||
|
@@ -914,6 +924,12 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
return iparams;
|
return iparams;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
|
||||||
|
LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
|
||||||
|
llama_free_model(model);
|
||||||
|
return iparams;
|
||||||
|
}
|
||||||
|
|
||||||
if (!params.control_vectors.empty()) {
|
if (!params.control_vectors.empty()) {
|
||||||
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
|
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
|
||||||
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
|
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
|
||||||
|
@@ -958,9 +974,9 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
common_lora_adapters_apply(lctx, iparams.lora_adapters);
|
common_lora_adapters_apply(lctx, iparams.lora_adapters);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
||||||
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||||
params.sparams.ignore_eos = false;
|
params.sampling.ignore_eos = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.warmup) {
|
if (params.warmup) {
|
||||||
|
@@ -1012,9 +1028,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_model_params common_model_params_to_llama(const common_params & params) {
|
struct llama_model_params common_model_params_to_llama(common_params & params) {
|
||||||
auto mparams = llama_model_default_params();
|
auto mparams = llama_model_default_params();
|
||||||
|
|
||||||
|
if (!params.devices.empty()) {
|
||||||
|
mparams.devices = params.devices.data();
|
||||||
|
}
|
||||||
if (params.n_gpu_layers != -1) {
|
if (params.n_gpu_layers != -1) {
|
||||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||||
}
|
}
|
||||||
|
@@ -1042,6 +1061,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
|
||||||
if (s == "f16") {
|
if (s == "f16") {
|
||||||
return GGML_TYPE_F16;
|
return GGML_TYPE_F16;
|
||||||
}
|
}
|
||||||
|
if (s == "bf16") {
|
||||||
|
return GGML_TYPE_BF16;
|
||||||
|
}
|
||||||
if (s == "q8_0") {
|
if (s == "q8_0") {
|
||||||
return GGML_TYPE_Q8_0;
|
return GGML_TYPE_Q8_0;
|
||||||
}
|
}
|
||||||
|
@@ -1369,17 +1391,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_model * common_load_model_from_url(
|
struct llama_model * common_load_model_from_url(
|
||||||
const char * model_url,
|
const std::string & model_url,
|
||||||
const char * path_model,
|
const std::string & local_path,
|
||||||
const char * hf_token,
|
const std::string & hf_token,
|
||||||
const struct llama_model_params & params) {
|
const struct llama_model_params & params) {
|
||||||
// Basic validation of the model_url
|
// Basic validation of the model_url
|
||||||
if (!model_url || strlen(model_url) == 0) {
|
if (model_url.empty()) {
|
||||||
LOG_ERR("%s: invalid model_url\n", __func__);
|
LOG_ERR("%s: invalid model_url\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!common_download_file(model_url, path_model, hf_token)) {
|
if (!common_download_file(model_url, local_path, hf_token)) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1390,9 +1412,9 @@ struct llama_model * common_load_model_from_url(
|
||||||
/*.no_alloc = */ true,
|
/*.no_alloc = */ true,
|
||||||
/*.ctx = */ NULL,
|
/*.ctx = */ NULL,
|
||||||
};
|
};
|
||||||
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
|
auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
|
||||||
if (!ctx_gguf) {
|
if (!ctx_gguf) {
|
||||||
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
|
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1411,13 +1433,13 @@ struct llama_model * common_load_model_from_url(
|
||||||
// Verify the first split file format
|
// Verify the first split file format
|
||||||
// and extract split URL and PATH prefixes
|
// and extract split URL and PATH prefixes
|
||||||
{
|
{
|
||||||
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
|
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
|
||||||
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
|
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
|
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
|
||||||
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
|
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -1444,14 +1466,14 @@ struct llama_model * common_load_model_from_url(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return llama_load_model_from_file(path_model, params);
|
return llama_load_model_from_file(local_path.c_str(), params);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_model * common_load_model_from_hf(
|
struct llama_model * common_load_model_from_hf(
|
||||||
const char * repo,
|
const std::string & repo,
|
||||||
const char * model,
|
const std::string & remote_path,
|
||||||
const char * path_model,
|
const std::string & local_path,
|
||||||
const char * hf_token,
|
const std::string & hf_token,
|
||||||
const struct llama_model_params & params) {
|
const struct llama_model_params & params) {
|
||||||
// construct hugging face model url:
|
// construct hugging face model url:
|
||||||
//
|
//
|
||||||
|
@@ -1465,27 +1487,27 @@ struct llama_model * common_load_model_from_hf(
|
||||||
std::string model_url = "https://huggingface.co/";
|
std::string model_url = "https://huggingface.co/";
|
||||||
model_url += repo;
|
model_url += repo;
|
||||||
model_url += "/resolve/main/";
|
model_url += "/resolve/main/";
|
||||||
model_url += model;
|
model_url += remote_path;
|
||||||
|
|
||||||
return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
|
return common_load_model_from_url(model_url, local_path, hf_token, params);
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
struct llama_model * common_load_model_from_url(
|
struct llama_model * common_load_model_from_url(
|
||||||
const char * /*model_url*/,
|
const std::string & /*model_url*/,
|
||||||
const char * /*path_model*/,
|
const std::string & /*local_path*/,
|
||||||
const char * /*hf_token*/,
|
const std::string & /*hf_token*/,
|
||||||
const struct llama_model_params & /*params*/) {
|
const struct llama_model_params & /*params*/) {
|
||||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_model * common_load_model_from_hf(
|
struct llama_model * common_load_model_from_hf(
|
||||||
const char * /*repo*/,
|
const std::string & /*repo*/,
|
||||||
const char * /*model*/,
|
const std::string & /*remote_path*/,
|
||||||
const char * /*path_model*/,
|
const std::string & /*local_path*/,
|
||||||
const char * /*hf_token*/,
|
const std::string & /*hf_token*/,
|
||||||
const struct llama_model_params & /*params*/) {
|
const struct llama_model_params & /*params*/) {
|
||||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
@@ -1520,6 +1542,66 @@ void common_batch_add(
|
||||||
batch.n_tokens++;
|
batch.n_tokens++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Token utils
|
||||||
|
//
|
||||||
|
|
||||||
|
size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
|
||||||
|
size_t i;
|
||||||
|
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
||||||
|
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
|
||||||
|
// check for empty sequences
|
||||||
|
if (a.empty() || b.empty()) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// get the lengths of the input sequences
|
||||||
|
size_t a_len = a.size();
|
||||||
|
size_t b_len = b.size();
|
||||||
|
|
||||||
|
// initialize the maximum length of the longest common subsequence (LCS)
|
||||||
|
size_t max_length = 0;
|
||||||
|
|
||||||
|
// use two rows instead of a 2D matrix to optimize space
|
||||||
|
std::vector<size_t> prev_row(b_len + 1, 0);
|
||||||
|
std::vector<size_t> curr_row(b_len + 1, 0);
|
||||||
|
|
||||||
|
// iterate through the elements of a
|
||||||
|
for (size_t i = 1; i <= a_len; i++) {
|
||||||
|
// iterate through the elements of b
|
||||||
|
for (size_t j = 1; j <= b_len; j++) {
|
||||||
|
// if elements at the current positions match
|
||||||
|
if (a[i - 1] == b[j - 1]) {
|
||||||
|
// if it's the first element of either sequence, set LCS length to 1
|
||||||
|
if (i == 1 || j == 1) {
|
||||||
|
curr_row[j] = 1;
|
||||||
|
} else {
|
||||||
|
// increment LCS length by 1 compared to the previous element
|
||||||
|
curr_row[j] = prev_row[j - 1] + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// update max_length if necessary
|
||||||
|
if (curr_row[j] > max_length) {
|
||||||
|
max_length = curr_row[j];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// reset LCS length if elements don't match
|
||||||
|
curr_row[j] = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// update the previous row for the next iteration
|
||||||
|
prev_row = curr_row;
|
||||||
|
}
|
||||||
|
|
||||||
|
// return the maximum length of the LCS
|
||||||
|
return max_length;
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// Vocab utils
|
// Vocab utils
|
||||||
//
|
//
|
||||||
|
@@ -1984,216 +2066,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// YAML utils
|
|
||||||
//
|
|
||||||
|
|
||||||
void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
|
|
||||||
if (data.empty()) {
|
|
||||||
fprintf(stream, "%s:\n", prop_name);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(stream, "%s: [", prop_name);
|
|
||||||
for (size_t i = 0; i < data.size() - 1; ++i) {
|
|
||||||
fprintf(stream, "%e, ", data[i]);
|
|
||||||
}
|
|
||||||
fprintf(stream, "%e]\n", data.back());
|
|
||||||
}
|
|
||||||
|
|
||||||
void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
|
|
||||||
if (data.empty()) {
|
|
||||||
fprintf(stream, "%s:\n", prop_name);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(stream, "%s: [", prop_name);
|
|
||||||
for (size_t i = 0; i < data.size() - 1; ++i) {
|
|
||||||
fprintf(stream, "%d, ", data[i]);
|
|
||||||
}
|
|
||||||
fprintf(stream, "%d]\n", data.back());
|
|
||||||
}
|
|
||||||
|
|
||||||
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
|
|
||||||
std::string data_str(data == NULL ? "" : data);
|
|
||||||
|
|
||||||
if (data_str.empty()) {
|
|
||||||
fprintf(stream, "%s:\n", prop_name);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t pos_start = 0;
|
|
||||||
size_t pos_found = 0;
|
|
||||||
|
|
||||||
if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
|
|
||||||
data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
|
|
||||||
data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
|
|
||||||
data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
|
|
||||||
data_str = "\"" + data_str + "\"";
|
|
||||||
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (data_str.find('\n') == std::string::npos) {
|
|
||||||
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(stream, "%s: |\n", prop_name);
|
|
||||||
while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
|
|
||||||
fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
|
|
||||||
pos_start = pos_found + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
|
|
||||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
|
||||||
const auto & sparams = params.sparams;
|
|
||||||
|
|
||||||
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
|
||||||
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
|
|
||||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
|
||||||
fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
|
|
||||||
|
|
||||||
#ifdef NDEBUG
|
|
||||||
fprintf(stream, "debug: false\n");
|
|
||||||
#else
|
|
||||||
fprintf(stream, "debug: true\n");
|
|
||||||
#endif // NDEBUG
|
|
||||||
|
|
||||||
fprintf(stream, "model_desc: %s\n", model_desc);
|
|
||||||
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
|
|
||||||
|
|
||||||
#ifdef __OPTIMIZE__
|
|
||||||
fprintf(stream, "optimize: true\n");
|
|
||||||
#else
|
|
||||||
fprintf(stream, "optimize: false\n");
|
|
||||||
#endif // __OPTIMIZE__
|
|
||||||
|
|
||||||
fprintf(stream, "time: %s\n", timestamp.c_str());
|
|
||||||
|
|
||||||
fprintf(stream, "\n");
|
|
||||||
fprintf(stream, "###############\n");
|
|
||||||
fprintf(stream, "# User Inputs #\n");
|
|
||||||
fprintf(stream, "###############\n");
|
|
||||||
fprintf(stream, "\n");
|
|
||||||
|
|
||||||
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
|
||||||
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
|
||||||
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
|
||||||
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
|
||||||
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
|
||||||
fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
|
|
||||||
fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
|
|
||||||
fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
|
|
||||||
fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
|
|
||||||
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
|
||||||
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
|
||||||
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
|
|
||||||
yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
|
|
||||||
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
|
||||||
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
|
||||||
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
|
||||||
fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
|
|
||||||
|
|
||||||
yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
|
||||||
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
|
|
||||||
yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
|
|
||||||
fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
|
|
||||||
fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
|
|
||||||
fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
|
|
||||||
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
|
||||||
|
|
||||||
fprintf(stream, "logit_bias:\n");
|
|
||||||
for (const auto & logit_bias : sparams.logit_bias) {
|
|
||||||
fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(stream, "lora:\n");
|
|
||||||
for (auto & la : params.lora_adapters) {
|
|
||||||
if (la.scale == 1.0f) {
|
|
||||||
fprintf(stream, " - %s\n", la.path.c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fprintf(stream, "lora_scaled:\n");
|
|
||||||
for (auto & la : params.lora_adapters) {
|
|
||||||
if (la.scale != 1.0f) {
|
|
||||||
fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
|
|
||||||
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
|
||||||
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
|
|
||||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
|
||||||
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
|
||||||
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
|
||||||
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
|
||||||
fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
|
|
||||||
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
|
||||||
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
|
||||||
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
|
||||||
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
|
||||||
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
|
|
||||||
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
|
||||||
fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
|
|
||||||
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
|
||||||
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
|
||||||
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
|
|
||||||
yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
|
|
||||||
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
|
||||||
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
|
||||||
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
|
||||||
yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
|
|
||||||
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
|
|
||||||
|
|
||||||
fprintf(stream, "reverse_prompt:\n");
|
|
||||||
for (std::string ap : params.antiprompt) {
|
|
||||||
size_t pos = 0;
|
|
||||||
while ((pos = ap.find('\n', pos)) != std::string::npos) {
|
|
||||||
ap.replace(pos, 1, "\\n");
|
|
||||||
pos += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(stream, " - %s\n", ap.c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
|
|
||||||
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
|
|
||||||
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
|
||||||
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
|
||||||
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
|
|
||||||
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
|
|
||||||
|
|
||||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
|
|
||||||
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
|
|
||||||
|
|
||||||
fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
|
|
||||||
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
|
||||||
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
|
||||||
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
|
||||||
fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
|
|
||||||
fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
|
|
||||||
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
|
|
||||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
|
||||||
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
|
||||||
}
|
|
||||||
|
|
|
@@ -39,6 +39,8 @@ struct common_lora_adapter_container : common_lora_adapter_info {
|
||||||
struct llama_lora_adapter * adapter;
|
struct llama_lora_adapter * adapter;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
using llama_tokens = std::vector<llama_token>;
|
||||||
|
|
||||||
// build info
|
// build info
|
||||||
extern int LLAMA_BUILD_NUMBER;
|
extern int LLAMA_BUILD_NUMBER;
|
||||||
extern char const * LLAMA_COMMIT;
|
extern char const * LLAMA_COMMIT;
|
||||||
|
@@ -107,8 +109,8 @@ enum dimre_method {
|
||||||
DIMRE_METHOD_MEAN,
|
DIMRE_METHOD_MEAN,
|
||||||
};
|
};
|
||||||
|
|
||||||
// sampler parameters
|
// sampling parameters
|
||||||
struct common_sampler_params {
|
struct common_params_sampling {
|
||||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||||
|
|
||||||
int32_t n_prev = 64; // number of previous tokens to remember
|
int32_t n_prev = 64; // number of previous tokens to remember
|
||||||
|
@@ -137,6 +139,7 @@ struct common_sampler_params {
|
||||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||||
bool ignore_eos = false;
|
bool ignore_eos = false;
|
||||||
bool no_perf = false; // disable performance metrics
|
bool no_perf = false; // disable performance metrics
|
||||||
|
bool timing_per_token = false;
|
||||||
|
|
||||||
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
|
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
|
||||||
|
|
||||||
|
@@ -160,21 +163,30 @@ struct common_sampler_params {
|
||||||
std::string print() const;
|
std::string print() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct common_params_speculative {
|
||||||
|
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||||
|
int32_t n_ctx = 0; // draft context size
|
||||||
|
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
||||||
|
int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
|
||||||
|
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||||
|
float p_split = 0.1f; // speculative decoding split probability
|
||||||
|
float p_min = 0.9f; // minimum speculative decoding probability (greedy)
|
||||||
|
|
||||||
|
struct cpu_params cpuparams;
|
||||||
|
struct cpu_params cpuparams_batch;
|
||||||
|
|
||||||
|
std::string model = ""; // draft model for speculative decoding // NOLINT
|
||||||
|
};
|
||||||
|
|
||||||
struct common_params {
|
struct common_params {
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
int32_t n_ctx = 0; // context size
|
int32_t n_ctx = 4096; // context size
|
||||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
|
|
||||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||||
int32_t n_sequences = 1; // number of sequences to decode
|
int32_t n_sequences = 1; // number of sequences to decode
|
||||||
float p_split = 0.1f; // speculative decoding split probability
|
|
||||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
|
||||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
|
||||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
|
||||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
|
||||||
int32_t grp_attn_n = 1; // group-attention factor
|
int32_t grp_attn_n = 1; // group-attention factor
|
||||||
int32_t grp_attn_w = 512; // group-attention width
|
int32_t grp_attn_w = 512; // group-attention width
|
||||||
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
|
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
|
||||||
|
@@ -185,27 +197,31 @@ struct common_params {
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
-    float defrag_thold = -1.0f; // KV cache defragmentation threshold
+    float defrag_thold = 0.1f; // KV cache defragmentation threshold
 
+    // offload params
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
-    struct cpu_params draft_cpuparams;
-    struct cpu_params draft_cpuparams_batch;
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
 
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct common_sampler_params sparams;
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
 
     std::string model = ""; // model path // NOLINT
-    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
     std::string model_alias = "unknown"; // model alias // NOLINT
     std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
@@ -216,7 +232,6 @@ struct common_params {
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
     std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
     std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
-    std::string logdir = ""; // directory in which to save YAML log files // NOLINT
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = ""; // file for saving *all* logits // NOLINT
@@ -462,17 +477,28 @@ struct common_init_result {
 
 struct common_init_result common_init_from_params(common_params & params);
 
-struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_model_params common_model_params_to_llama ( common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(
+    const std::string & model_url,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(
+    const std::string & repo,
+    const std::string & remote_path,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
 void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
 
+//
 // Batch utils
+//
 
 void common_batch_clear(struct llama_batch & batch);
 
@@ -483,6 +509,16 @@ void common_batch_add(
     const std::vector<llama_seq_id> & seq_ids,
     bool logits);
 
+//
+// Token utils
+//
+
+// longest common prefix
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+// longet common subsequence
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
 //
 // Vocab utils
 //
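For orientation, here is a minimal sketch of how the reworked string-based loaders above might be called from application code. The repo, remote file, local path and the helper name `load_model_from_hf_example` are placeholders, not part of this commit:

```cpp
#include "common.h"

// a minimal sketch, assuming `params` was filled in by the usual argument parser;
// the repo/file/path strings below are hypothetical values for illustration only
static llama_model * load_model_from_hf_example(common_params & params) {
    struct llama_model_params mparams = common_model_params_to_llama(params);

    // download (or reuse a cached copy of) a GGUF file from the HF hub and load it
    return common_load_model_from_hf(
        "ggml-org/models",      // repo        (placeholder)
        "model-q4_0.gguf",      // remote_path (placeholder)
        "/tmp/model-q4_0.gguf", // local_path  (placeholder)
        params.hf_token,
        mparams);
}
```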
@@ -808,15 +844,3 @@ private:
     static const char * const LLM_KV_SPLIT_NO = "split.no";
     static const char * const LLM_KV_SPLIT_COUNT = "split.count";
     static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-    FILE * stream, const common_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
@@ -99,7 +99,7 @@ struct ring_buffer {
 };
 
 struct common_sampler {
-    common_sampler_params params;
+    common_params_sampling params;
 
     struct llama_sampler * grmr;
     struct llama_sampler * chain;
@@ -125,7 +125,7 @@ struct common_sampler {
     }
 };
 
-std::string common_sampler_params::print() const {
+std::string common_params_sampling::print() const {
     char result[1024];
 
     snprintf(result, sizeof(result),
@@ -150,7 +150,7 @@ bool common_sampler_trigger_grammar(const struct llama_model * model, common_sam
     return true;
 }
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
     lparams.no_perf = params.no_perf;
@ -333,6 +333,45 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||||
return cur_p.data[cur_p.selected].id;
|
return cur_p.data[cur_p.selected].id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
|
||||||
|
GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
|
||||||
|
|
||||||
|
std::vector<llama_token> result;
|
||||||
|
result.reserve(idxs.size());
|
||||||
|
|
||||||
|
size_t i = 0;
|
||||||
|
for (; i < draft.size(); i++) {
|
||||||
|
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
|
||||||
|
|
||||||
|
common_sampler_accept(gsmpl, id, true);
|
||||||
|
|
||||||
|
result.push_back(id);
|
||||||
|
|
||||||
|
if (draft[i] != id) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i == draft.size()) {
|
||||||
|
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
|
||||||
|
|
||||||
|
common_sampler_accept(gsmpl, id, true);
|
||||||
|
|
||||||
|
result.push_back(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
|
||||||
|
std::vector<int> idxs(draft.size() + 1);
|
||||||
|
for (size_t i = 0; i < idxs.size(); ++i) {
|
||||||
|
idxs[i] = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
|
||||||
|
}
|
||||||
|
|
||||||
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
||||||
return llama_sampler_get_seed(gsmpl->chain);
|
return llama_sampler_get_seed(gsmpl->chain);
|
||||||
}
|
}
|
||||||
|
|
|
@@ -36,7 +36,7 @@ struct common_sampler;
 
 // llama_sampler API overloads
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
 
 void common_sampler_free(struct common_sampler * gsmpl);
 
@@ -60,6 +60,27 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 //
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
+// generalized version of common_sampler_sample
+//
+// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
+// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
+//
+// common_sampler_sample_n(gsmpl, ctx, { idx }, {});
+//
+// is equivalent to
+//
+// common_sampler_sample(gsmpl, ctx, idx);
+// common_sampler_accept(gsmpl, token, true);
+//
+// requires: idxs.size() == draft.size() + 1
+//
+// returns at least 1 token, up to idxs.size()
+//
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+
+// assume idxs == [ 0, 1, 2, ..., draft.size() ]
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 
 // helpers
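As a quick illustration of the contract spelled out in the comments above, here is a minimal sketch of verifying a batch of draft tokens in one call. It assumes `gsmpl`, `ctx` and `draft` are supplied by the caller, and the helper name is hypothetical:

```cpp
#include "sampling.h"

// a minimal sketch, assuming gsmpl and ctx were created by the caller and
// `draft` holds the tokens proposed by a draft model for the next positions
static llama_tokens accept_draft_example(struct common_sampler * gsmpl,
                                         struct llama_context  * ctx,
                                         const llama_tokens    & draft) {
    // convenience overload: idxs == [ 0, 1, ..., draft.size() ]
    std::vector<llama_token> accepted =
        common_sampler_sample_and_accept_n(gsmpl, ctx, draft, /* grammar_first */ false);

    // per the comments above, at least 1 token is returned: the prefix of `draft`
    // the sampler agreed with, plus one token sampled where it first diverged
    return accepted;
}
```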
common/speculative.cpp (new file, 270 lines)
@@ -0,0 +1,270 @@
|
||||||
|
#include "speculative.h"
|
||||||
|
|
||||||
|
#include "log.h"
|
||||||
|
#include "common.h"
|
||||||
|
#include "sampling.h"
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
|
||||||
|
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||||
|
|
||||||
|
struct common_speculative {
|
||||||
|
struct llama_context * ctx;
|
||||||
|
struct common_sampler * smpl;
|
||||||
|
|
||||||
|
llama_batch batch;
|
||||||
|
llama_tokens prompt;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct common_speculative * common_speculative_init(
|
||||||
|
struct llama_context * ctx_dft) {
|
||||||
|
auto * result = new common_speculative {
|
||||||
|
/* .ctx = */ ctx_dft,
|
||||||
|
/* .smpl = */ nullptr,
|
||||||
|
/* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
|
||||||
|
/* .prompt = */ {},
|
||||||
|
};
|
||||||
|
|
||||||
|
// TODO: optimize or pass from outside?
|
||||||
|
#if 0
|
||||||
|
{
|
||||||
|
common_params_sampling params;
|
||||||
|
params.no_perf = false;
|
||||||
|
|
||||||
|
params.top_k = 40;
|
||||||
|
params.top_p = 0.9;
|
||||||
|
|
||||||
|
params.samplers = {
|
||||||
|
COMMON_SAMPLER_TYPE_TOP_K,
|
||||||
|
COMMON_SAMPLER_TYPE_TOP_P,
|
||||||
|
COMMON_SAMPLER_TYPE_INFILL,
|
||||||
|
};
|
||||||
|
|
||||||
|
result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
{
|
||||||
|
common_params_sampling params;
|
||||||
|
params.no_perf = false;
|
||||||
|
|
||||||
|
params.top_k = 10;
|
||||||
|
|
||||||
|
params.samplers = {
|
||||||
|
COMMON_SAMPLER_TYPE_TOP_K,
|
||||||
|
};
|
||||||
|
|
||||||
|
result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
void common_speculative_free(struct common_speculative * spec) {
|
||||||
|
common_sampler_free(spec->smpl);
|
||||||
|
|
||||||
|
llama_batch_free(spec->batch);
|
||||||
|
|
||||||
|
delete spec;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool common_speculative_are_compatible(
|
||||||
|
const struct llama_context * ctx_tgt,
|
||||||
|
const struct llama_context * ctx_dft) {
|
||||||
|
const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
|
||||||
|
const struct llama_model * model_dft = llama_get_model(ctx_dft);
|
||||||
|
|
||||||
|
const bool vocab_type_tgt = llama_vocab_type(model_tgt);
|
||||||
|
LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
|
||||||
|
|
||||||
|
const bool vocab_type_dft = llama_vocab_type(model_dft);
|
||||||
|
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
|
||||||
|
|
||||||
|
if (vocab_type_tgt != vocab_type_dft) {
|
||||||
|
LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
|
||||||
|
"vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
|
||||||
|
llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
|
||||||
|
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
|
||||||
|
llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
|
||||||
|
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
|
||||||
|
LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
|
||||||
|
LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const int n_vocab_tgt = llama_n_vocab(model_tgt);
|
||||||
|
const int n_vocab_dft = llama_n_vocab(model_dft);
|
||||||
|
|
||||||
|
const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
|
||||||
|
|
||||||
|
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
|
||||||
|
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
|
||||||
|
"target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
|
||||||
|
__func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
|
||||||
|
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
|
||||||
|
const char * token_text_dft = llama_token_get_text(model_dft, i);
|
||||||
|
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
|
||||||
|
LOG_ERR("%s: draft model vocab must match target model to use speculation but "
|
||||||
|
"token %d content differs - target '%s', draft '%s'\n", __func__, i,
|
||||||
|
common_token_to_piece(ctx_tgt, i).c_str(),
|
||||||
|
common_token_to_piece(ctx_dft, i).c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_tokens common_speculative_gen_draft(
|
||||||
|
struct common_speculative * spec,
|
||||||
|
struct common_speculative_params params,
|
||||||
|
const llama_tokens & prompt_tgt,
|
||||||
|
llama_token id_last) {
|
||||||
|
auto & batch = spec->batch;
|
||||||
|
auto & ctx = spec->ctx;
|
||||||
|
auto & smpl = spec->smpl;
|
||||||
|
auto & prompt = spec->prompt;
|
||||||
|
|
||||||
|
int reuse_i = 0;
|
||||||
|
int reuse_n = 0;
|
||||||
|
|
||||||
|
const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
|
||||||
|
|
||||||
|
const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
|
||||||
|
|
||||||
|
// reuse as much as possible from the old draft context
|
||||||
|
// ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
|
||||||
|
for (int i = 0; i < (int) prompt.size(); ++i) {
|
||||||
|
int cur = 0;
|
||||||
|
while (i_start + cur < (int) prompt_tgt.size() &&
|
||||||
|
i + cur < (int) prompt.size() &&
|
||||||
|
prompt_tgt[i_start + cur] == prompt[i + cur]) {
|
||||||
|
cur++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
|
||||||
|
reuse_i = i;
|
||||||
|
reuse_n = cur;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
|
||||||
|
|
||||||
|
llama_tokens result;
|
||||||
|
result.reserve(params.n_draft);
|
||||||
|
|
||||||
|
if (reuse_n == 0) {
|
||||||
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
|
prompt.clear();
|
||||||
|
} else {
|
||||||
|
// this happens when a previous draft has been discarded (for example, due to being too small), but the
|
||||||
|
// target model agreed with it. in this case, we simply pass back the previous results to save compute
|
||||||
|
if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
|
||||||
|
for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
|
||||||
|
result.push_back(prompt[i]);
|
||||||
|
|
||||||
|
if (params.n_draft <= (int) result.size()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (reuse_i > 0) {
|
||||||
|
llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
|
||||||
|
llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
|
||||||
|
|
||||||
|
prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (reuse_n < (int) prompt.size()) {
|
||||||
|
llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
|
||||||
|
|
||||||
|
prompt.erase(prompt.begin() + reuse_n, prompt.end());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// prepare a batch to evaluate any new tokens in the prompt
|
||||||
|
common_batch_clear(batch);
|
||||||
|
|
||||||
|
for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
|
||||||
|
//LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
|
||||||
|
common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
|
||||||
|
|
||||||
|
prompt.push_back(prompt_tgt[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// we should rarely end-up here during normal decoding
|
||||||
|
if (batch.n_tokens > 0) {
|
||||||
|
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
|
||||||
|
|
||||||
|
llama_decode(ctx, batch);
|
||||||
|
}
|
||||||
|
|
||||||
|
const llama_pos n_past = prompt.size();
|
||||||
|
|
||||||
|
LOG_DBG("%s: n_past = %d\n", __func__, n_past);
|
||||||
|
|
||||||
|
common_batch_clear(batch);
|
||||||
|
common_batch_add (batch, id_last, n_past, { 0 }, true);
|
||||||
|
|
||||||
|
prompt.push_back(id_last);
|
||||||
|
|
||||||
|
//LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
|
||||||
|
|
||||||
|
llama_decode(ctx, batch);
|
||||||
|
|
||||||
|
common_sampler_reset(smpl);
|
||||||
|
|
||||||
|
// sample n_draft tokens from the draft model
|
||||||
|
for (int i = 0; i < params.n_draft; ++i) {
|
||||||
|
common_batch_clear(batch);
|
||||||
|
|
||||||
|
common_sampler_sample(smpl, ctx, 0, true);
|
||||||
|
|
||||||
|
const auto * cur_p = common_sampler_get_candidates(smpl);
|
||||||
|
|
||||||
|
for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
|
||||||
|
LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||||
|
k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// add drafted token for each sequence
|
||||||
|
const llama_token id = cur_p->data[0].id;
|
||||||
|
|
||||||
|
// only collect very high-confidence draft tokens
|
||||||
|
if (cur_p->data[0].p < params.p_min) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
common_sampler_accept(smpl, id, true);
|
||||||
|
|
||||||
|
result.push_back(id);
|
||||||
|
|
||||||
|
if (params.n_draft <= (int) result.size()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
|
||||||
|
|
||||||
|
// evaluate the drafted tokens on the draft model
|
||||||
|
llama_decode(ctx, batch);
|
||||||
|
|
||||||
|
prompt.push_back(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
common/speculative.h (new file, 28 lines)
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "llama.h"
+#include "common.h"
+
+struct common_speculative;
+
+struct common_speculative_params {
+    int n_draft = 16; // max drafted tokens
+    int n_reuse = 256;
+
+    float p_min = 0.9f; // min probabiliy required to accept a token in the draft
+};
+
+struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
+
+void common_speculative_free(struct common_speculative * spec);
+
+bool common_speculative_are_compatible(
+    const struct llama_context * ctx_tgt,
+    const struct llama_context * ctx_dft);
+
+// sample up to n_draft tokens and add them to the batch using the draft model
+llama_tokens common_speculative_gen_draft(
+    struct common_speculative * spec,
+    struct common_speculative_params params,
+    const llama_tokens & prompt,
+    llama_token id_last);
common/train.cpp (1515 lines changed; file diff suppressed because it is too large)
common/train.h (deleted, 233 lines)
@@ -1,233 +0,0 @@
|
||||||
// Various helper functions and utilities for training
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <random>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "llama.h"
|
|
||||||
|
|
||||||
#define LLAMA_TRAIN_MAX_NODES 16384
|
|
||||||
|
|
||||||
typedef std::string mt19937_state;
|
|
||||||
|
|
||||||
struct train_state {
|
|
||||||
struct ggml_opt_context * opt;
|
|
||||||
|
|
||||||
uint64_t train_its;
|
|
||||||
uint64_t train_samples;
|
|
||||||
uint64_t train_tokens;
|
|
||||||
uint64_t train_epochs;
|
|
||||||
|
|
||||||
size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
|
|
||||||
mt19937_state shuffle_rng_state_current;
|
|
||||||
mt19937_state shuffle_rng_state_next;
|
|
||||||
size_t shuffle_sample_count;
|
|
||||||
size_t shuffle_next_sample;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct train_params_common {
|
|
||||||
const char * fn_train_data;
|
|
||||||
const char * fn_checkpoint_in;
|
|
||||||
const char * fn_checkpoint_out;
|
|
||||||
const char * pattern_fn_it;
|
|
||||||
const char * fn_latest;
|
|
||||||
|
|
||||||
bool print_usage;
|
|
||||||
|
|
||||||
int save_every;
|
|
||||||
|
|
||||||
uint32_t seed;
|
|
||||||
|
|
||||||
int n_ctx;
|
|
||||||
int n_threads;
|
|
||||||
int n_batch;
|
|
||||||
int n_gradient_accumulation;
|
|
||||||
int n_epochs;
|
|
||||||
int n_gpu_layers;
|
|
||||||
|
|
||||||
bool custom_n_ctx;
|
|
||||||
|
|
||||||
bool use_flash;
|
|
||||||
bool use_checkpointing;
|
|
||||||
|
|
||||||
std::string sample_start;
|
|
||||||
bool include_sample_start;
|
|
||||||
bool escape;
|
|
||||||
bool overlapping_samples;
|
|
||||||
bool fill_with_next_samples;
|
|
||||||
bool separate_with_eos;
|
|
||||||
bool separate_with_bos;
|
|
||||||
bool sample_random_offsets;
|
|
||||||
|
|
||||||
bool force_reshuffle;
|
|
||||||
|
|
||||||
int warmup;
|
|
||||||
int cos_decay_steps;
|
|
||||||
float cos_decay_restart;
|
|
||||||
float cos_decay_min;
|
|
||||||
bool enable_restart;
|
|
||||||
|
|
||||||
int opt_past;
|
|
||||||
float opt_delta;
|
|
||||||
int opt_max_no_improvement;
|
|
||||||
|
|
||||||
int adam_n_iter;
|
|
||||||
float adam_alpha;
|
|
||||||
float adam_min_alpha;
|
|
||||||
float adam_decay;
|
|
||||||
int adam_decay_min_ndim;
|
|
||||||
float adam_beta1;
|
|
||||||
float adam_beta2;
|
|
||||||
float adam_gclip;
|
|
||||||
float adam_eps_f;
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef void (*save_train_files_callback)(void * data, struct train_state * train);
|
|
||||||
|
|
||||||
struct train_opt_callback_data {
|
|
||||||
struct train_params_common * params;
|
|
||||||
struct train_state * train;
|
|
||||||
save_train_files_callback save_cb;
|
|
||||||
void * save_data;
|
|
||||||
struct llama_context * lctx;
|
|
||||||
int last_save_iter;
|
|
||||||
llama_token * tokens_data;
|
|
||||||
size_t tokens_size;
|
|
||||||
size_t * samples_begin;
|
|
||||||
size_t * samples_size;
|
|
||||||
size_t * shuffled_samples_offs;
|
|
||||||
size_t * shuffled_samples_begin;
|
|
||||||
size_t * shuffled_samples_size;
|
|
||||||
size_t samples_count;
|
|
||||||
struct ggml_tensor * tokens_input;
|
|
||||||
struct ggml_tensor * target_probs;
|
|
||||||
int first_iter;
|
|
||||||
int first_epoch;
|
|
||||||
int iter_at_last_epoch;
|
|
||||||
int64_t last_time;
|
|
||||||
double millis_per_iter;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct train_state * init_train_state();
|
|
||||||
void free_train_state(struct train_state * state);
|
|
||||||
|
|
||||||
struct train_params_common get_default_train_params_common();
|
|
||||||
void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
|
|
||||||
|
|
||||||
bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
|
|
||||||
void finish_processing_train_args(struct train_params_common * params);
|
|
||||||
|
|
||||||
struct random_normal_distribution;
|
|
||||||
struct random_uniform_distribution;
|
|
||||||
|
|
||||||
struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
|
|
||||||
struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
|
|
||||||
|
|
||||||
void free_random_normal_distribution (struct random_normal_distribution * rnd);
|
|
||||||
void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
|
|
||||||
|
|
||||||
struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
|
|
||||||
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
|
|
||||||
|
|
||||||
// generate random float in interval [0,1)
|
|
||||||
float frand();
|
|
||||||
float frand_normal (struct random_normal_distribution * rnd);
|
|
||||||
float frand_uniform(struct random_uniform_distribution * rnd);
|
|
||||||
|
|
||||||
int clamp (const int v, const int min, const int max);
|
|
||||||
float fclamp(const float v, const float min, const float max);
|
|
||||||
|
|
||||||
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
|
|
||||||
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
|
|
||||||
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
|
|
||||||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
|
|
||||||
|
|
||||||
size_t tokenize_file(
|
|
||||||
struct llama_context * lctx,
|
|
||||||
const char * filename,
|
|
||||||
const std::string & sample_start,
|
|
||||||
bool include_sample_start,
|
|
||||||
bool overlapping_samples,
|
|
||||||
unsigned context_length,
|
|
||||||
std::vector<llama_token> & out_tokens,
|
|
||||||
std::vector<size_t> & out_samples_begin,
|
|
||||||
std::vector<size_t> & out_samples_size);
|
|
||||||
|
|
||||||
int64_t get_example_targets_batch(
|
|
||||||
struct llama_context * lctx,
|
|
||||||
struct ggml_tensor * tokens_input,
|
|
||||||
struct ggml_tensor * target_probs,
|
|
||||||
int64_t example_id,
|
|
||||||
const size_t * samples_offs,
|
|
||||||
const size_t * samples_begin,
|
|
||||||
const size_t * samples_size,
|
|
||||||
size_t samples_count,
|
|
||||||
const llama_token * train_data,
|
|
||||||
size_t n_train_data,
|
|
||||||
bool separate_with_eos,
|
|
||||||
bool separate_with_bos,
|
|
||||||
bool fill_with_next_samples,
|
|
||||||
bool sample_random_offsets);
|
|
||||||
|
|
||||||
|
|
||||||
void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
|
|
||||||
mt19937_state mt19937_get_state(const std::mt19937& rng);
|
|
||||||
mt19937_state mt19937_seed_to_state(unsigned seed);
|
|
||||||
|
|
||||||
mt19937_state shuffle_samples(
|
|
||||||
const mt19937_state & rng_state,
|
|
||||||
size_t * shuffled_offs,
|
|
||||||
size_t * shuffled_begins,
|
|
||||||
size_t * shuffled_sizes,
|
|
||||||
const size_t * begins,
|
|
||||||
const size_t * sizes,
|
|
||||||
size_t count);
|
|
||||||
|
|
||||||
size_t hash_combine(size_t h1, size_t h2);
|
|
||||||
|
|
||||||
size_t compute_samples_hash(
|
|
||||||
const char* fn,
|
|
||||||
const size_t* samples_begin,
|
|
||||||
const size_t* samples_size,
|
|
||||||
size_t sample_count);
|
|
||||||
|
|
||||||
|
|
||||||
std::string replace_str(const char * s, const char * needle, const char * replacement);
|
|
||||||
|
|
||||||
void print_duration(double milliseconds);
|
|
||||||
|
|
||||||
float cosine_decay(
|
|
||||||
int64_t step,
|
|
||||||
int64_t decay_steps,
|
|
||||||
float minimum);
|
|
||||||
|
|
||||||
float cosine_decay_restart(
|
|
||||||
int64_t step,
|
|
||||||
int64_t decay_steps,
|
|
||||||
float minimum,
|
|
||||||
float restart_step_mult);
|
|
||||||
|
|
||||||
float learning_schedule(
|
|
||||||
int64_t step,
|
|
||||||
int64_t warmup_steps,
|
|
||||||
int64_t decay_steps,
|
|
||||||
float learning_rate,
|
|
||||||
float overall_minimum,
|
|
||||||
float cos_decay_minimum,
|
|
||||||
float cos_decay_restart_step_mult,
|
|
||||||
bool enable_restart);
|
|
||||||
|
|
||||||
void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
|
|
||||||
|
|
||||||
void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
|
|
||||||
void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
|
|
||||||
|
|
||||||
bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
|
|
||||||
void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
|
|
||||||
|
|
||||||
std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
|
|
||||||
|
|
||||||
void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
|
|
|
@ -72,7 +72,8 @@ class Model:
|
||||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
||||||
use_temp_file: bool = False, eager: bool = False,
|
use_temp_file: bool = False, eager: bool = False,
|
||||||
metadata_override: Path | None = None, model_name: str | None = None,
|
metadata_override: Path | None = None, model_name: str | None = None,
|
||||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
|
||||||
|
small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
|
||||||
if type(self) is Model:
|
if type(self) is Model:
|
||||||
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
||||||
|
|
||||||
|
@ -87,7 +88,7 @@ class Model:
|
||||||
self.is_safetensors = len(self.part_names) > 0
|
self.is_safetensors = len(self.part_names) > 0
|
||||||
if not self.is_safetensors:
|
if not self.is_safetensors:
|
||||||
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
|
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
|
||||||
self.hparams = Model.load_hparams(self.dir_model)
|
self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
|
||||||
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
|
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
|
||||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||||
self.tensor_names = None
|
self.tensor_names = None
|
||||||
|
@ -657,6 +658,9 @@ class Model:
|
||||||
if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
|
if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
|
||||||
# ref: https://huggingface.co/facebook/chameleon-7b
|
# ref: https://huggingface.co/facebook/chameleon-7b
|
||||||
res = "chameleon"
|
res = "chameleon"
|
||||||
|
if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
|
||||||
|
# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
|
||||||
|
res = "minerva-7b"
|
||||||
|
|
||||||
if res is None:
|
if res is None:
|
||||||
logger.warning("\n")
|
logger.warning("\n")
|
||||||
|
@ -1541,6 +1545,17 @@ class LlamaModel(Model):
|
||||||
special_vocab._set_special_token("eot", 32010)
|
special_vocab._set_special_token("eot", 32010)
|
||||||
special_vocab.add_to_gguf(self.gguf_writer)
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
|
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||||
|
if tokenizer_config_file.is_file():
|
||||||
|
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||||
|
tokenizer_config_json = json.load(f)
|
||||||
|
if "add_prefix_space" in tokenizer_config_json:
|
||||||
|
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
||||||
|
|
||||||
|
# Apply to granite small models only
|
||||||
|
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||||
|
self.gguf_writer.add_add_bos_token(False)
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
hparams = self.hparams
|
hparams = self.hparams
|
||||||
|
@ -1557,17 +1572,6 @@ class LlamaModel(Model):
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||||
|
|
||||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
|
||||||
if tokenizer_config_file.is_file():
|
|
||||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
|
||||||
tokenizer_config_json = json.load(f)
|
|
||||||
if "add_prefix_space" in tokenizer_config_json:
|
|
||||||
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
|
||||||
|
|
||||||
# Apply to granite small models only
|
|
||||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
|
||||||
self.gguf_writer.add_add_bos_token(False)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||||
if n_head_kv is not None and n_head != n_head_kv:
|
if n_head_kv is not None and n_head != n_head_kv:
|
||||||
|
@ -1830,29 +1834,40 @@ class MiniCPMModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.MINICPM
|
model_arch = gguf.MODEL_ARCH.MINICPM
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
block_count = self.hparams["num_hidden_layers"]
|
super().set_gguf_parameters()
|
||||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
embedding_scale = float(self.hparams["scale_emb"])
|
||||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
self.gguf_writer.add_embedding_scale(embedding_scale)
|
||||||
self.gguf_writer.add_block_count(block_count)
|
logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
|
||||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
|
||||||
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
self.gguf_writer.add_residual_scale(residual_scale)
|
||||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
|
||||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
self.gguf_writer.add_logit_scale(logit_scale)
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
|
||||||
|
if self.hparams.get("rope_scaling") is not None:
|
||||||
|
if self.hparams["rope_scaling"].get("type") == "longrope":
|
||||||
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
|
||||||
|
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
|
||||||
|
|
||||||
|
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||||
|
|
||||||
|
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
||||||
|
if rope_scaling is not None:
|
||||||
|
long_factors = rope_scaling.get('long_factor', None)
|
||||||
|
short_factors = rope_scaling.get('short_factor', None)
|
||||||
|
|
||||||
|
if long_factors is None or short_factors is None:
|
||||||
|
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
||||||
|
|
||||||
|
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
|
||||||
|
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
|
||||||
|
|
||||||
|
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
|
||||||
|
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
self._set_vocab_llama_hf()
|
self._set_vocab_sentencepiece()
|
||||||
|
|
||||||
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
|
||||||
if n_kv_head is not None and n_head != n_kv_head:
|
|
||||||
n_head //= n_kv_head
|
|
||||||
|
|
||||||
return (
|
|
||||||
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
|
||||||
.swapaxes(1, 2)
|
|
||||||
.reshape(weights.shape)
|
|
||||||
)
|
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
del bid # unused
|
del bid # unused
|
||||||
|
@ -1862,9 +1877,9 @@ class MiniCPMModel(Model):
|
||||||
|
|
||||||
# HF models permute some of the tensors, so we need to undo that
|
# HF models permute some of the tensors, so we need to undo that
|
||||||
if name.endswith(("q_proj.weight")):
|
if name.endswith(("q_proj.weight")):
|
||||||
data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
|
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||||
if name.endswith(("k_proj.weight")):
|
if name.endswith(("k_proj.weight")):
|
||||||
data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
|
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||||
|
|
||||||
return [(self.map_tensor_name(name), data_torch)]
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
@ -2706,7 +2721,7 @@ class XLMRobertaModel(BertModel):
|
||||||
self.gguf_writer.add_token_scores(scores)
|
self.gguf_writer.add_token_scores(scores)
|
||||||
self.gguf_writer.add_token_types(toktypes)
|
self.gguf_writer.add_token_types(toktypes)
|
||||||
self.gguf_writer.add_add_space_prefix(add_prefix)
|
self.gguf_writer.add_add_space_prefix(add_prefix)
|
||||||
self.gguf_writer.add_token_type_count(1)
|
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
||||||
self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
|
self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
|
||||||
if precompiled_charsmap:
|
if precompiled_charsmap:
|
||||||
self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
|
self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
|
||||||
|
@ -3039,6 +3054,11 @@ class OlmoModel(Model):
|
||||||
return [(self.map_tensor_name(name), data_torch)]
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("Olmo2ForCausalLM")
|
||||||
|
class Olmo2Model(Model):
|
||||||
|
model_arch = gguf.MODEL_ARCH.OLMO2
|
||||||
|
|
||||||
|
|
||||||
@Model.register("OlmoeForCausalLM")
|
@Model.register("OlmoeForCausalLM")
|
||||||
class OlmoeModel(Model):
|
class OlmoeModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.OLMOE
|
model_arch = gguf.MODEL_ARCH.OLMOE
|
||||||
|
@ -3747,10 +3767,7 @@ class JaisModel(Model):
|
||||||
|
|
||||||
# Embeddings scale
|
# Embeddings scale
|
||||||
self.embeddings_scale = 1.0
|
self.embeddings_scale = 1.0
|
||||||
# note: For some JAIS flavors, output is tied to (same as) wte in original model
|
|
||||||
self.output_is_wte = False
|
|
||||||
if 'mup_embeddings_scale' in self.hparams:
|
if 'mup_embeddings_scale' in self.hparams:
|
||||||
self.output_is_wte = True # Hack (?)
|
|
||||||
self.embeddings_scale = self.hparams['mup_embeddings_scale']
|
self.embeddings_scale = self.hparams['mup_embeddings_scale']
|
||||||
elif 'embeddings_scale' in self.hparams:
|
elif 'embeddings_scale' in self.hparams:
|
||||||
self.embeddings_scale = self.hparams['embeddings_scale']
|
self.embeddings_scale = self.hparams['embeddings_scale']
|
||||||
|
@ -3807,10 +3824,7 @@ class JaisModel(Model):
|
||||||
|
|
||||||
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
|
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
|
||||||
tensors.append((new_name, data_torch * self.embeddings_scale))
|
tensors.append((new_name, data_torch * self.embeddings_scale))
|
||||||
if self.output_is_wte:
|
|
||||||
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
|
|
||||||
elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
|
elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
|
||||||
assert not self.output_is_wte
|
|
||||||
tensors.append((new_name, data_torch * self.width_scale))
|
tensors.append((new_name, data_torch * self.width_scale))
|
||||||
else:
|
else:
|
||||||
tensors.append((new_name, data_torch))
|
tensors.append((new_name, data_torch))
|
||||||
|
|
|
@@ -17,7 +17,7 @@
 #
 # python3 convert_hf_to_gguf_update.py <huggingface_token>
 #
-# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
+# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
@@ -102,6 +102,7 @@ models = [
     {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
     {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
     {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
+    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
 ]
 
 
||||||
|
|
|
@ -12,6 +12,7 @@ import json
|
||||||
from math import prod
|
from math import prod
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
|
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
|
||||||
|
from transformers import AutoConfig
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
@ -256,8 +257,8 @@ def parse_args() -> argparse.Namespace:
|
||||||
help="only print out what will be done, without writing any new files",
|
help="only print out what will be done, without writing any new files",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--base", type=Path, required=True,
|
"--base", type=Path,
|
||||||
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required",
|
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"lora_path", type=Path,
|
"lora_path", type=Path,
|
||||||
|
@ -267,6 +268,12 @@ def parse_args() -> argparse.Namespace:
|
||||||
 return parser.parse_args()


+def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
+    # normally, adapter does not come with base model config, we need to load it from AutoConfig
+    config = AutoConfig.from_pretrained(hf_model_id)
+    return config.to_dict()
+
+
 if __name__ == '__main__':
     args = parse_args()
     logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

@@ -281,7 +288,7 @@ if __name__ == '__main__':
     ftype = ftype_map[args.outtype]

-    dir_base_model: Path = args.base
+    dir_base_model: Path | None = args.base
     dir_lora: Path = args.lora_path
     lora_config = dir_lora / "adapter_config.json"
     input_model = dir_lora / "adapter_model.safetensors"

@@ -301,9 +308,29 @@ if __name__ == '__main__':
         input_model = os.path.join(dir_lora, "adapter_model.bin")
         lora_model = torch.load(input_model, map_location="cpu", weights_only=True)

+    # load LoRA config
+    with open(lora_config, "r") as f:
+        lparams: dict[str, Any] = json.load(f)
+
     # load base model
-    logger.info(f"Loading base model: {dir_base_model.name}")
-    hparams = Model.load_hparams(dir_base_model)
+    if dir_base_model is None:
+        if "base_model_name_or_path" in lparams:
+            model_id = lparams["base_model_name_or_path"]
+            logger.info(f"Loading base model from Hugging Face: {model_id}")
+            try:
+                hparams = load_hparams_from_hf(model_id)
+            except OSError as e:
+                logger.error(f"Failed to load base model config: {e}")
+                logger.error("Please try downloading the base model and add its path to --base")
+                sys.exit(1)
+        else:
+            logger.error("'base_model_name_or_path' is not found in adapter_config.json")
+            logger.error("Base model config is required. Please download the base model and add its path to --base")
+            sys.exit(1)
+    else:
+        logger.info(f"Loading base model: {dir_base_model.name}")
+        hparams = Model.load_hparams(dir_base_model)

     with torch.inference_mode():
         try:
             model_class = Model.from_model_architecture(hparams["architectures"][0])

@@ -323,13 +350,15 @@ if __name__ == '__main__':
                 self.dir_model_card = dir_lora_model
                 self.lora_alpha = float(lora_alpha)

+            def set_vocab(self):
+                pass
+
             def set_type(self):
                 self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                 self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")

             def set_gguf_parameters(self):
                 self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
-                super().set_gguf_parameters()

             def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
                 # Never add extra tensors (e.g. rope_freqs) for LoRA adapters

@@ -350,7 +379,7 @@ if __name__ == '__main__':
                         logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                         if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                             logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
-                            logger.error("Hint: if you are using TRL, make sure not to call setup_chat_format()")
+                            logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
                         sys.exit(1)

                     if base_name in tensor_map:

@@ -384,9 +413,6 @@ if __name__ == '__main__':
                     yield (dest_name + ".lora_a", lora_a)
                     yield (dest_name + ".lora_b", lora_b)

-    with open(lora_config, "r") as f:
-        lparams: dict[str, Any] = json.load(f)
-
     alpha: float = lparams["lora_alpha"]

     model_instance = LoraModel(

@@ -399,6 +425,7 @@ if __name__ == '__main__':
         dry_run=args.dry_run,
         dir_lora_model=dir_lora,
         lora_alpha=alpha,
+        hparams=hparams,
     )

     logger.info("Exporting model...")
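The change above makes `--base` optional: when the adapter's `adapter_config.json` carries `base_model_name_or_path`, the base model's hyperparameters are fetched via `AutoConfig` instead of requiring a local directory. A minimal usage sketch, assuming the converter is the repository's `convert_lora_to_gguf.py` and that the adapter directory is passed as the positional argument (both assumptions, not stated in this diff):

```bash
# new fallback: no --base needed if adapter_config.json names the base model
python convert_lora_to_gguf.py /path/to/lora_adapter --outtype f16

# previous behaviour still works when the base model is available locally
python convert_lora_to_gguf.py /path/to/lora_adapter --base /path/to/base_model --outtype f16
```

If the Hugging Face lookup fails with an `OSError`, the script exits and asks for `--base`, as shown in the hunk above.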
@@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf
 Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:

 ```
-$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
+$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
 ```

-Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
+Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.

 To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:
@@ -27,13 +27,6 @@ We recommend using openmp since it's easier to modify the cores being used.

 ### llama.cpp compilation

-Makefile:
-
-```bash
-make GGML_BLIS=1 -j
-# make GGML_BLIS=1 llama-benchmark-matmult
-```
-
 CMake:

 ```bash
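With the Makefile path removed above, only the CMake route remains; the actual command is cut off by the hunk. The following is a hedged sketch, assuming BLIS is selected through the generic BLAS backend using the `FLAME` vendor name that CMake's FindBLAS module uses for BLIS (an assumption, not confirmed by this diff):

```bash
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME
cmake --build build --config Release
```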
@@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi

 ## News

+- 2024.11
+  - Support F16 and F32 data type model for Ascend 310P NPU.
 - 2024.8
   - Support `Q4_0` and `Q8_0` data type for Ascend NPU.
 - 2024.7

@@ -40,9 +42,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 ### Ascend NPU

 **Verified devices**

 | Ascend NPU | Status |
 |:-----------------------------:|:-------:|
 | Atlas 300T A2 | Support |
+| Atlas 300I Duo | Support |

 *Notes:*
@@ -34,13 +34,16 @@ The SYCL backend would be broken by some PRs due to no online CI.

 The following release is verified with good quality:

-|Commit ID|Tag|Release|Verified Platform|
-|-|-|-|-|
-|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
+|Commit ID|Tag|Release|Verified Platform| Update date|
+|-|-|-|-|-|
+|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||


 ## News

+- 2024.11
+  - Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer.

 - 2024.8
   - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.

@@ -310,12 +313,14 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_
 export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR

 # Build LLAMA with Nvidia BLAS acceleration through SYCL
+# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
+GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

 # Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON

 # build all binary
 cmake --build build --config Release -j -v

@@ -333,8 +338,9 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_

 ## AMD
 # Use FP32, FP16 is not supported
-# Find your GGML_SYCL_HIP_TARGET with rocminfo, under the key 'Name:'
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_HIP_TARGET=${GGML_SYCL_HIP_TARGET} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+# Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:'
+GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

 # build all binary
 cmake --build build --config Release -j -v

@@ -377,7 +383,7 @@ found 2 SYCL devices:

 |Chosen Device ID|Setting|
 |-|-|
-|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
+|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action|
 |1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
 |0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|

@@ -644,6 +650,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 |--------------------|---------------------------------------|---------------------------------------------|
 | GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
 | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
+| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
 | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
 | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
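Pulling the hunks above together, a hedged end-to-end sketch of the new `GGML_SYCL_DEVICE_ARCH` hint on the NVIDIA path (the architecture value is only an example, as in the diff itself):

```bash
# optional, but can improve performance; sm_80 is the example architecture used above
export GGML_SYCL_DEVICE_ARCH=sm_80

cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA \
      -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} \
      -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build build --config Release -j -v
```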
docs/build.md
@@ -7,33 +7,11 @@ git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
 ```

-In order to build llama.cpp you have four different options.
+The following sections describe how to build with different backends and options.

-- Using `make`:
-  - On Linux or MacOS:
-
-      ```bash
-      make
-      ```
-
-  - On Windows (x86/x64 only, arm64 requires cmake):
-
-    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
-    2. Extract `w64devkit` on your pc.
-    3. Run `w64devkit.exe`.
-    4. Use the `cd` command to reach the `llama.cpp` folder.
-    5. From here you can run:
-        ```bash
-        make
-        ```
-
-  - Notes:
-    - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
-    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
-    - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For debug builds, run `make LLAMA_DEBUG=1`
-
-- Using `CMake`:
+## CPU Build
+
+Build llama.cpp using `CMake`:

 ```bash
 cmake -B build
@@ -42,9 +20,8 @@ In order to build llama.cpp you have four different options.

 **Notes**:

-- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
-- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
-- For faster repeated compilation, install [ccache](https://ccache.dev/).
+- For faster compilation, add the `-j` argument to run multiple jobs in parallel, or use a generator that does this automatically such as Ninja. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
+- For faster repeated compilation, install [ccache](https://ccache.dev/)
 - For debug builds, there are two cases:

 1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
@@ -60,6 +37,14 @@ In order to build llama.cpp you have four different options.
     cmake -B build -G "Xcode"
     cmake --build build --config Debug
     ```
+
+    For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
+  ```
+  cmake -B build -DBUILD_SHARED_LIBS=OFF
+  cmake --build build --config Release
+  ```
+
 - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
   - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
     - Tab Workload: Desktop-development with C++
@@ -70,61 +55,20 @@ In order to build llama.cpp you have four different options.
     cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
     cmake --build build-arm64-windows-llvm-release
     ```
-  Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
-
-- Using `gmake` (FreeBSD):
-
-  1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
-  2. Add your user to **video** group
-  3. Install compilation dependencies.
-
-     ```bash
-     sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
-
-     gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
-     ```
-
-## Metal Build
-
-On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
-To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
-
-When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
-argument.
+  Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.

 ## BLAS Build

-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Using BLAS doesn't affect the generation performance. There are currently several different BLAS implementations available for build and use:

-### Accelerate Framework:
+### Accelerate Framework

 This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.

-### OpenBLAS:
+### OpenBLAS

 This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.

-- Using `make`:
-  - On Linux:
-    ```bash
-    make GGML_OPENBLAS=1
-    ```
-
-  - On Windows:
-
-    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
-    2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
-    3. Extract `w64devkit` on your pc.
-    4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
-    5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
-    6. Run `w64devkit.exe`.
-    7. Use the `cd` command to reach the `llama.cpp` folder.
-    8. From here you can run:
-
-       ```bash
-       make GGML_OPENBLAS=1
-       ```
-
 - Using `CMake` on Linux:

     ```bash
@@ -136,14 +80,6 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i

 Check [BLIS.md](./backend/BLIS.md) for more information.

-### SYCL
-
-SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
-
-llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
-
-For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
-
 ### Intel oneMKL

 Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
@@ -161,16 +97,29 @@ Building through oneAPI compilers will make avx_vnni instruction set available f

 Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.

-### CUDA
+### Other BLAS libraries

-This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+Any other BLAS library can be used by setting the `GGML_BLAS_VENDOR` option. See the [CMake documentation](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) for a list of supported vendors.

-For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
+## Metal Build
+
+On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
+To disable the Metal build at compile time use the `-DGGML_METAL=OFF` cmake option.
+
+When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers 0` command-line argument.
+
+## SYCL
+
+SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
+
+llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
+
+For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
+
+## CUDA
+
+This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).

-- Using `make`:
-  ```bash
-  make GGML_CUDA=1
-  ```
 - Using `CMake`:

   ```bash
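The hunk above drops the `make GGML_CUDA=1` path and is cut off before the CMake command. Based on the pattern used by the other backends in this file, the CMake equivalent presumably looks like the following (a sketch inferred from the surrounding hunks, not taken verbatim from this diff):

```bash
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```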
@@ -186,24 +135,16 @@ The following compilation options are also available to tweak performance:

 | Option | Legal values | Default | Description |
 |-------------------------------|------------------------|---------|--------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
 | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
 | GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
 | GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
-| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
 | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |

-### MUSA
+## MUSA

 This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).

-- Using `make`:
-  ```bash
-  make GGML_MUSA=1
-  ```
 - Using `CMake`:

   ```bash
@@ -217,20 +158,16 @@ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enab

 Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.

-### hipBLAS
+## HIP

-This provides BLAS acceleration on HIP-supported AMD GPUs.
+This provides GPU acceleration on HIP-supported AMD GPUs.
 Make sure to have ROCm installed.
 You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).

-- Using `make`:
-  ```bash
-  make GGML_HIPBLAS=1
-  ```
 - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
   ```bash
   HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
       && cmake --build build --config Release -- -j 16
   ```
   On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
@@ -247,19 +184,14 @@ You can download it from your Linux distro's package manager or from here: [ROCm
   ```bash
   HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
       HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-      cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
       && cmake --build build -- -j 16
   ```

-- Using `make` (example for target gfx1030, build with 16 CPU threads):
-  ```bash
-  make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
-  ```
-
 - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
   ```bash
   set PATH=%HIP_PATH%\bin;%PATH%
-  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
   cmake --build build
   ```
   Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@@ -268,23 +200,16 @@ You can download it from your Linux distro's package manager or from here: [ROCm

 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
 If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
-The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
-
-| Option | Legal values | Default | Description |
-|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
-| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
-
-### Vulkan
+## Vulkan

 **Windows**

-#### w64devkit
+### w64devkit

-Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
+Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).

-Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required.
+Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.

 Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
 ```sh
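For the `HIP_VISIBLE_DEVICES` / `HSA_OVERRIDE_GFX_VERSION` variables kept at the top of the hunk above, a hedged run sketch (the GPU index, override value, and model path are placeholders; 10.3.0 is the RDNA2 example given in the text):

```bash
# pick the first GPU and treat an unofficially supported RDNA2 card as gfx1030-compatible
export HIP_VISIBLE_DEVICES=0
export HSA_OVERRIDE_GFX_VERSION=10.3.0
./build/bin/llama-cli -m /path/to/model.gguf -ngl 99
```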
@@ -300,9 +225,37 @@ Libs: -lvulkan-1
 EOF

 ```
-Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
+Switch into the `llama.cpp` directory and build using CMake.
+```sh
+cmake -B build -DGGML_VULKAN=ON
+cmake --build build --config Release
+```
+
+### Git Bash MINGW64
+
+Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
+
+Download and install [`Visual Studio Community Edition`](https://visualstudio.microsoft.com/) and make sure you select `C++`
+
+Download and install [`CMake`](https://cmake.org/download/) with the default settings
+
+Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
+
+Go into your `llama.cpp` directory and right click, select `Open Git Bash Here` and then run the following commands
+
+```
+cmake -B build -DGGML_VULKAN=ON
+cmake --build build --config Release
+```
+
+Now you can load the model in conversation mode using `Vulkan`
+
+```sh
+build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
+```

-#### MSYS2
+### MSYS2
 Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
 ```sh
 pacman -S git \
@@ -311,7 +264,8 @@ Install [MSYS2](https://www.msys2.org/) and then run the following commands in a
     mingw-w64-ucrt-x86_64-vulkan-devel \
     mingw-w64-ucrt-x86_64-shaderc
 ```
-Switch into `llama.cpp` directory and build using CMake.
+
+Switch into the `llama.cpp` directory and build using CMake.
 ```sh
 cmake -B build -DGGML_VULKAN=ON
 cmake --build build --config Release
@@ -360,7 +314,7 @@ cmake --build build --config Release
 # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
 ```

-### CANN
+## CANN
 This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU.

 For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/).
@@ -375,22 +329,26 @@ cmake --build build --config release

 You can test with:

-`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
-
-If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
 ```bash
-llm_load_tensors: CANN buffer size = 13313.00 MiB
+./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32
+```
+
+If the following info is output on screen, you are using `llama.cpp` with the CANN backend:
+```bash
+llm_load_tensors: CANN model buffer size = 13313.00 MiB
 llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
 ```

 For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).

-### Android
+## Android

 To read documentation for how to build on Android, [click here](./android.md)

-### Arm CPU optimized mulmat kernels
+## Notes about GPU-accelerated backends

-Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
+The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.

-To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
+In most cases, it is possible to build and use multiple backends at the same time. For example, you can build llama.cpp with both CUDA and Vulkan support by using the `-DGGML_CUDA=ON -DGGML_VULKAN=ON` options with CMake. At runtime, you can specify which backend devices to use with the `--device` option. To see a list of available devices, use the `--list-devices` option.
+
+Backends can be built as dynamic libraries that can be loaded dynamically at runtime. This allows you to use the same llama.cpp binary on different machines with different GPUs. To enable this feature, use the `GGML_BACKEND_DL` option when building.
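A short sketch of the multi-backend workflow described at the end of the hunk above (the model path is a placeholder; `-DGGML_CUDA=ON -DGGML_VULKAN=ON`, `GGML_BACKEND_DL`, `--list-devices` and `--device` are the options named in the text):

```bash
# build with two GPU backends at once; add -DGGML_BACKEND_DL=ON to load backends as dynamic libraries
cmake -B build -DGGML_CUDA=ON -DGGML_VULKAN=ON
cmake --build build --config Release

# inspect the available devices, then pin one explicitly or disable GPU offload entirely
./build/bin/llama-cli --list-devices
./build/bin/llama-cli -m /path/to/model.gguf --device none
```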
@@ -6,20 +6,20 @@ find_package(Threads REQUIRED)

 # ...

+# flags
+
+llama_add_compile_flags()
+
 # examples

 include_directories(${CMAKE_CURRENT_SOURCE_DIR})

 if (EMSCRIPTEN)
 else()
-    add_subdirectory(cvector-generator)
-    add_subdirectory(baby-llama)
     add_subdirectory(batched-bench)
     add_subdirectory(batched)
-    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
-    add_subdirectory(export-lora)
     add_subdirectory(gbnf-validator)
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
@@ -28,27 +28,36 @@ else()
     add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
-    add_subdirectory(llava)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
     add_subdirectory(main)
    add_subdirectory(parallel)
     add_subdirectory(passkey)
     add_subdirectory(perplexity)
-    add_subdirectory(quantize-stats)
     add_subdirectory(quantize)
     add_subdirectory(retrieval)
-    if (GGML_RPC)
-        add_subdirectory(rpc)
-    endif()
     if (LLAMA_BUILD_SERVER)
         add_subdirectory(server)
     endif()
+    add_subdirectory(save-load-state)
+    add_subdirectory(run)
+    add_subdirectory(simple)
+    add_subdirectory(simple-chat)
+    add_subdirectory(speculative)
+    add_subdirectory(speculative-simple)
+    add_subdirectory(tokenize)
+    if (NOT GGML_BACKEND_DL)
+        # these examples use the backends directly and cannot be built with dynamic loading
+        add_subdirectory(convert-llama2c-to-ggml)
+        add_subdirectory(cvector-generator)
+        add_subdirectory(export-lora)
+        add_subdirectory(quantize-stats)
+        add_subdirectory(llava)
+        if (GGML_RPC)
+            add_subdirectory(rpc)
+        endif()
     if (GGML_SYCL)
         add_subdirectory(sycl)
     endif()
-    add_subdirectory(save-load-state)
-    add_subdirectory(simple)
-    add_subdirectory(speculative)
-    add_subdirectory(tokenize)
+    endif()
 endif()
@@ -1,5 +0,0 @@
-set(TARGET llama-baby-llama)
-add_executable(${TARGET} baby-llama.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
(File diff suppressed because it is too large.)
@@ -1,61 +0,0 @@
-#!/bin/bash
-#
-# Few-shot translation example.
-# Requires a base model (i.e. no fine-tuned or instruct models).
-#
-# Usage:
-#
-#   cd llama.cpp
-#   make -j
-#
-#   ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
-#
-
-if [ $# -lt 2 ]; then
-  echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
-  exit 1
-fi
-
-eargs=""
-if [ $# -gt 2 ]; then
-  eargs="${@:3}"
-fi
-
-ftmp="__llama.cpp_example_tmp__.txt"
-trap "rm -f $ftmp" EXIT
-
-echo "Translate from English to French:
-
-===
-
-sea otter, peppermint, plush girafe:
-
-sea otter => loutre de mer
-peppermint => menthe poivrée
-plush girafe => girafe peluche
-
-===
-
-violin
-
-violin => violon
-
-===
-
-phone, computer, mouse, keyboard:
-
-phone => téléphone
-computer => ordinateur
-mouse => souris
-keyboard => clavier
-
-===
-" > $ftmp
-
-echo "$2
-" >> $ftmp
-
-model=$1
-
-# generate the most likely continuation until the string "===" is found
-./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
@@ -2,4 +2,4 @@ set(TARGET llama-batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -2,4 +2,4 @@ set(TARGET llama-batched)
 add_executable(${TARGET} batched.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -68,10 +68,10 @@ int main(int argc, char ** argv) {

     llama_sampler * smpl = llama_sampler_chain_init(sparams);

-    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
-    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
-    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));

     if (ctx == NULL) {
         LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
@@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
 NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
 NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"

-SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
-SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
+SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
+'|'\
+'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
 SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"

 CTX_SIZE=2048

@@ -129,15 +130,12 @@ while read -e line; do

     printf ' '

-    # HACK get num tokens from debug message
-    # TODO get both messages in one go
-    if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
-       ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
+    if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
        echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
        exit 1
     fi

-    n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
+    n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")

     if ((n_tokens > CTX_ROTATE_POINT)); then
         tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
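To see what the merged pattern and the `awk` sum in the hunk above do, here is a self-contained sketch using two hypothetical log lines (the numbers are made up):

```bash
log='main: session file matches 120 / 2048
sampling time =      12.34 ms /    56'

pattern='main: session file matches [[:digit:]]+ / [[:digit:]]+|sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'

# one grep now matches both messages; the field after '/' holds the token counts
msg=$(grep -oE "$pattern" <<< "$log")
n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$msg")")
echo "$n_tokens"   # prints 2104 for the sample lines above
```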
@@ -2,4 +2,4 @@ set(TARGET llama-convert-llama2c-to-ggml)
 add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -2,11 +2,8 @@

 This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.

-To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
-
-`$ make -j`
-
-After successful compilation, following usage options are available:
+To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository.

 ```
 usage: ./llama-convert-llama2c-to-ggml [options]
@@ -840,6 +840,8 @@ class OutputFile:
                     self.gguf.add_base_model_version(key, base_model_entry["version"])
                 if "organization" in base_model_entry:
                     self.gguf.add_base_model_organization(key, base_model_entry["organization"])
+                if "description" in base_model_entry:
+                    self.gguf.add_base_model_description(key, base_model_entry["description"])
                 if "url" in base_model_entry:
                     self.gguf.add_base_model_url(key, base_model_entry["url"])
                 if "doi" in base_model_entry:

@@ -849,12 +851,32 @@ class OutputFile:
                 if "repo_url" in base_model_entry:
                     self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])

+        if metadata.datasets is not None:
+            self.gguf.add_dataset_count(len(metadata.datasets))
+            for key, dataset_entry in enumerate(metadata.datasets):
+                if "name" in dataset_entry:
+                    self.gguf.add_dataset_name(key, dataset_entry["name"])
+                if "author" in dataset_entry:
+                    self.gguf.add_dataset_author(key, dataset_entry["author"])
+                if "version" in dataset_entry:
+                    self.gguf.add_dataset_version(key, dataset_entry["version"])
+                if "organization" in dataset_entry:
+                    self.gguf.add_dataset_organization(key, dataset_entry["organization"])
+                if "description" in dataset_entry:
+                    self.gguf.add_dataset_description(key, dataset_entry["description"])
+                if "url" in dataset_entry:
+                    self.gguf.add_dataset_url(key, dataset_entry["url"])
+                if "doi" in dataset_entry:
+                    self.gguf.add_dataset_doi(key, dataset_entry["doi"])
+                if "uuid" in dataset_entry:
+                    self.gguf.add_dataset_uuid(key, dataset_entry["uuid"])
+                if "repo_url" in dataset_entry:
+                    self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"])
+
         if metadata.tags is not None:
             self.gguf.add_tags(metadata.tags)
         if metadata.languages is not None:
             self.gguf.add_languages(metadata.languages)
-        if metadata.datasets is not None:
-            self.gguf.add_datasets(metadata.datasets)

     def add_meta_arch(self, params: Params) -> None:
         # Metadata About The Neural Architecture Itself
@@ -2,4 +2,4 @@ set(TARGET llama-cvector-generator)
 add_executable(${TARGET} cvector-generator.cpp pca.hpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -12,7 +12,7 @@ int main(int argc, char** argv) {
     }

     // Get only the program name from the full path
-    auto pos = filename.find_last_of('/');
+    auto pos = filename.find_last_of("/\\");
     if (pos != std::string::npos) {
         filename = filename.substr(pos+1);
     }
@@ -2,4 +2,4 @@ set(TARGET llama-embedding)
 add_executable(${TARGET} embedding.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -2,8 +2,9 @@ set(TARGET llama-eval-callback)
 add_executable(${TARGET} eval-callback.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

 set(TEST_TARGET test-eval-callback)
-add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+add_test(NAME ${TEST_TARGET}
+         COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
 set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
@@ -2,4 +2,4 @@ set(TARGET llama-export-lora)
 add_executable(${TARGET} export-lora.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -2,4 +2,4 @@ set(TARGET llama-gbnf-validator)
 add_executable(${TARGET} gbnf-validator.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -2,4 +2,4 @@ set(TARGET llama-gen-docs)
 add_executable(${TARGET} gen-docs.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -4,12 +4,19 @@ install(TARGETS ${TARGET} RUNTIME)

 # clibs dependencies
 include_directories(deps/)

 add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
 target_link_libraries(${TARGET} PRIVATE xxhash)

 add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
 target_link_libraries(${TARGET} PRIVATE sha1)
+if (NOT MSVC)
+    # disable warnings in 3rd party code
+    target_compile_options(sha1 PRIVATE -w)
+endif()

 add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
 target_link_libraries(${TARGET} PRIVATE sha256)

 target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -2,4 +2,4 @@ set(TARGET llama-gguf-split)
 add_executable(${TARGET} gguf-split.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -2,4 +2,4 @@ set(TARGET llama-gguf)
 add_executable(${TARGET} gguf.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -2,4 +2,4 @@ set(TARGET llama-gritlm)
 add_executable(${TARGET} gritlm.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -2,4 +2,4 @@ set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -25,8 +25,6 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 ## Example

 ```bash
-GGML_CUDA=1 make -j
-
 # generate importance matrix (imatrix.dat)
 ./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99

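A common next step, not part of the hunk above, is to pass the generated `imatrix.dat` to the quantizer. This is a hedged sketch: the `--imatrix` flag and the `Q4_K_M` type are assumed from `llama-quantize`'s usual interface, and the file names are placeholders.

```bash
# Assumed llama-quantize interface -- verify with ./llama-quantize --help
./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ggml-model-Q4_K_M.gguf Q4_K_M
```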
@@ -637,9 +637,18 @@ int main(int argc, char ** argv) {
         LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }

+    if (params.prompt.empty()) {
+        if (params.in_files.empty()) {
+            LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
+            return 1;
+        }
+        LOG_INF("No prompt provided; combining precomputed matrices only.\n");
+    } else {
     if (!compute_imatrix(ctx, params)) {
         return 1;
     }
+    }

     g_collector.save_imatrix();
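The added branch changes how `llama-imatrix` behaves when no prompt is given: if one or more `--in-file` matrices are supplied it only merges them, otherwise it exits with an error. A hedged usage sketch follows; only `--in-file` and `-f` appear in this diff, so the `-o` output flag and all file names are assumptions/placeholders.

```bash
# Merge precomputed matrices without running a new computation (no -f/-p prompt given).
# -o and the file names are illustrative -- verify with ./llama-imatrix --help
./llama-imatrix -m ggml-model-f16.gguf --in-file run-a.imatrix --in-file run-b.imatrix -o merged.imatrix

# With neither a prompt nor --in-file, the tool now exits with an error.
```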
@@ -2,4 +2,4 @@ set(TARGET llama-infill)
 add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -14,7 +14,7 @@ In this section, we cover the most commonly used options for running the `infill
 - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
 - `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.

 ## Input Prompts
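To tie the options above together, here is an illustrative `llama-infill` invocation. The `--in-prefix`/`--in-suffix` flags are assumptions (they are not part of this hunk), and the model path and prompt strings are placeholders.

```bash
# Hypothetical example; --in-prefix/--in-suffix are assumed flags, model path is a placeholder
./llama-infill -m models/7B/ggml-model.bin -c 4096 -n 64 \
    --in-prefix "def fibonacci(n):\n    " --in-suffix "\nprint(fibonacci(10))\n"
```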
@@ -43,50 +43,6 @@ static std::vector<llama_token> * g_output_tokens;

 static bool is_interacting = false;

-static void write_logfile(
-    const llama_context * ctx, const common_params & params, const llama_model * model,
-    const std::vector<llama_token> & input_tokens, const std::string & output,
-    const std::vector<llama_token> & output_tokens
-) {
-    if (params.logdir.empty()) {
-        return;
-    }
-
-    const std::string timestamp = string_get_sortable_timestamp();
-
-    const bool success = fs_create_directory_with_parents(params.logdir);
-    if (!success) {
-        LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
-            __func__, params.logdir.c_str());
-        return;
-    }
-
-    const std::string logfile_path = params.logdir + timestamp + ".yml";
-    FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-        return;
-    }
-
-    fprintf(logfile, "binary: infill\n");
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
-    fprintf(logfile, "\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "# Generation Results #\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "\n");
-
-    yaml_dump_string_multiline(logfile, "output", output.c_str());
-    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
-
-    llama_perf_dump_yaml(logfile, ctx);
-    fclose(logfile);
-}
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
     if (signo == SIGINT) {
@@ -96,7 +52,6 @@ static void sigint_handler(int signo) {
         console::cleanup();
         LOG("\n");
         common_perf_print(*g_ctx, *g_smpl);
-        write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);

         // make sure all logs are flushed
         LOG("Interrupted by user\n");
@@ -118,7 +73,7 @@ int main(int argc, char ** argv) {

     common_init();

-    auto & sparams = params.sparams;
+    auto & sparams = params.sampling;

     console::init(params.simple_io, params.use_color);
     atexit([]() { console::cleanup(); });
@@ -625,7 +580,6 @@ int main(int argc, char ** argv) {

     LOG("\n");
     common_perf_print(ctx, smpl);
-    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

     llama_free(ctx);
     llama_free_model(model);
@@ -2,4 +2,4 @@ set(TARGET llama-bench)
 add_executable(${TARGET} llama-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)