Compare commits

6 commits

Author | SHA1 | Date
---|---|---
 | 12aa74ba7d | 
 | 2605c139a6 | 
 | 3e9d3dbff9 | 
 | 6014a63125 | 
 | 927be9b58e | 
 | 284800b1e3 | 

1195 changed files with 199369 additions and 314735 deletions
161  .clang-format

@@ -1,161 +0,0 @@
----
-Language: Cpp
-AlignAfterOpenBracket: Align
-AlignArrayOfStructures: Left
-AlignConsecutiveAssignments: AcrossComments
-AlignConsecutiveBitFields: AcrossComments
-AlignConsecutiveDeclarations: AcrossComments
-AlignConsecutiveMacros: AcrossComments
-# AlignConsecutiveShortCaseStatements: AcrossComments
-AlignEscapedNewlines: Left # LeftWithLastLine
-AlignOperands: Align
-AlignTrailingComments:
-  Kind: Always
-  OverEmptyLines: 1
-AllowAllArgumentsOnNextLine: true
-AllowAllParametersOfDeclarationOnNextLine: false
-# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
-AllowShortBlocksOnASingleLine: Never
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: Inline
-AllowShortIfStatementsOnASingleLine: Never
-AllowShortLambdasOnASingleLine: Inline
-AllowShortLoopsOnASingleLine: false
-AlwaysBreakBeforeMultilineStrings: true
-BinPackArguments: true
-BinPackParameters: true # OnePerLine
-BitFieldColonSpacing: Both
-BreakBeforeBraces: Custom # Attach
-BraceWrapping:
-  AfterCaseLabel: true
-  AfterClass: false
-  AfterControlStatement: false
-  AfterEnum: false
-  AfterFunction: false
-  AfterNamespace: false
-  AfterObjCDeclaration: false
-  AfterStruct: false
-  AfterUnion: false
-  AfterExternBlock: false
-  BeforeCatch: false
-  BeforeElse: false
-  BeforeLambdaBody: false
-  BeforeWhile: false
-  IndentBraces: false
-  SplitEmptyFunction: false
-  SplitEmptyRecord: false
-  SplitEmptyNamespace: false
-# BreakAdjacentStringLiterals: true
-BreakAfterAttributes: Never
-BreakBeforeBinaryOperators: None
-BreakBeforeInlineASMColon: OnlyMultiline
-BreakBeforeTernaryOperators: false
-# BreakBinaryOperations: Never
-BreakConstructorInitializers: AfterColon
-# BreakFunctionDefinitionParameters: false
-BreakInheritanceList: AfterComma
-BreakStringLiterals: true
-# BreakTemplateDeclarations: Yes
-ColumnLimit: 120
-CommentPragmas: '^ IWYU pragma:'
-CompactNamespaces: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: false
-DerivePointerAlignment: false
-DisableFormat: false
-EmptyLineBeforeAccessModifier: Leave
-EmptyLineAfterAccessModifier: Never
-ExperimentalAutoDetectBinPacking: false
-FixNamespaceComments: true
-IncludeBlocks: Regroup
-IncludeCategories:
-  - Regex: '^<.*\.h>'
-    Priority: 1
-    SortPriority: 0
-  - Regex: '^<.*'
-    Priority: 2
-    SortPriority: 0
-  - Regex: '.*'
-    Priority: 3
-    SortPriority: 0
-IncludeIsMainRegex: '([-_](test|unittest))?$'
-IncludeIsMainSourceRegex: ''
-IndentAccessModifiers: false
-IndentCaseBlocks: true
-IndentCaseLabels: true
-IndentExternBlock: NoIndent
-IndentGotoLabels: false
-IndentPPDirectives: AfterHash
-IndentWidth: 4
-IndentWrappedFunctionNames: false
-InsertBraces: true # NOTE: may lead to incorrect formatting
-InsertNewlineAtEOF: true
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: false
-LambdaBodyIndentation: Signature
-LineEnding: LF
-MacroBlockBegin: ''
-MacroBlockEnd: ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBinPackProtocolList: Auto
-ObjCBlockIndentWidth: 4
-ObjCSpaceAfterProperty: true
-ObjCSpaceBeforeProtocolList: true
-PPIndentWidth: -1
-PackConstructorInitializers: CurrentLine
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 1
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyBreakTemplateDeclaration: 10
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
-PointerAlignment: Middle
-QualifierAlignment: Left
-#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
-RawStringFormats:
-  - Language: Cpp
-    Delimiters:
-      - cc
-      - CC
-      - cpp
-      - Cpp
-      - CPP
-      - 'c++'
-      - 'C++'
-    CanonicalDelimiter: ''
-ReferenceAlignment: Middle
-ReflowComments: false # IndentOnly
-SeparateDefinitionBlocks: Always
-SortIncludes: CaseInsensitive
-SortUsingDeclarations: LexicographicNumeric
-SpaceAfterCStyleCast: true
-SpaceAfterLogicalNot: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyBlock: false
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles: Never
-SpacesInContainerLiterals: true
-SpacesInLineCommentPrefix:
-  Minimum: 1
-  Maximum: -1
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-SpaceBeforeSquareBrackets: false
-Standard: c++17
-TabWidth: 4
-UseTab: Never
-WhitespaceSensitiveMacros: ['STRINGIZE']
-...

@@ -12,15 +12,12 @@ Checks: >
     -readability-implicit-bool-conversion,
     -readability-magic-numbers,
     -readability-uppercase-literal-suffix,
-    -readability-simplify-boolean-expr,
     clang-analyzer-*,
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
     portability-*,
-    -portability-simd-intrinsics,
     misc-*,
     -misc-const-correctness,
     -misc-non-private-member-variables-in-classes,
     -misc-no-recursion,
-    -misc-use-anonymous-namespace,
 FormatStyle: none

@@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
     stage('Running llama.cpp'){
         sh'''#!/bin/bash
         module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
-        qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+        qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
         cat llama_log.txt # Printing results
         '''
     }

@@ -1,92 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION AS build
-
-ARG TARGETARCH
-
-ARG GGML_CPU_ARM_ARCH=armv8-a
-
-RUN apt-get update && \
-    apt-get install -y build-essential git cmake libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "$TARGETARCH" = "amd64" ]; then \
-        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
-    elif [ "$TARGETARCH" = "arm64" ]; then \
-        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
-    else \
-        echo "Unsupported architecture"; \
-        exit 1; \
-    fi && \
-    cmake --build build -j $(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ubuntu:$UBUNTU_VERSION AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]

@@ -1,94 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.6.0
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-
-# CUDA architecture to build for (defaults to all supported archs)
-ARG CUDA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ${BASE_CUDA_RUN_CONTAINER} AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]

34  .devops/full-cuda.Dockerfile  Normal file

@@ -0,0 +1,34 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip git
+
+COPY requirements.txt requirements.txt
+COPY requirements requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]

45  .devops/full-rocm.Dockerfile  Normal file

@@ -0,0 +1,45 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+COPY requirements requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]

22  .devops/full.Dockerfile  Normal file

@@ -0,0 +1,22 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip git
+
+COPY requirements.txt requirements.txt
+COPY requirements requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,91 +0,0 @@
-ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
-
-## Build Image
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
-
-ARG GGML_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "GGML_SYCL_F16 is set" \
-        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
-    fi && \
-    echo "Building with dynamic libs" && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/lib/ /app
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]

@@ -1,44 +0,0 @@
-ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
-
-FROM ascendai/cann:$ASCEND_VERSION AS build
-
-WORKDIR /app
-
-COPY . .
-
-RUN yum install -y gcc g++ cmake make
-ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
-ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
-ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
-ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
-ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
-ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
-ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
-ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
-ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
-
-# find libascend_hal.so, because the drive hasn`t been mounted.
-ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
-
-RUN echo "Building with static libs" && \
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
-    cmake --build build --config Release --target llama-cli
-
-# TODO: use image with NNRT
-FROM ascendai/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
-
-ENV LC_ALL=C.utf8
-
-ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
-ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
-ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
-ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
-ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
-ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
-ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
-ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
-ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
-
-ENTRYPOINT ["/llama-cli" ]

84  .devops/llama-cpp-clblast.srpm.spec  Normal file

@@ -0,0 +1,84 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+# We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+# It is up to the user to install the correct vendor-specific support.
+
+Name: llama.cpp-clblast
+Version: %( date "+%%Y%%m%%d" )
+Release: 1%{?dist}
+Summary: OpenCL Inference of LLaMA model in C/C++
+License: MIT
+Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
+Requires: clblast
+URL: https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CPU inference for Meta's Lllama2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j LLAMA_CLBLAST=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llamaclblast
+cp -p server %{buildroot}%{_bindir}/llamaclblastserver
+cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
+[Unit]
+Description=Llama.cpp server, CPU only (no GPU support in this build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=never
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llamaclblast
+%{_bindir}/llamaclblastserver
+%{_bindir}/llamaclblastsimple
+/usr/lib/systemd/system/llamaclblast.service
+%config /etc/sysconfig/llama
+
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog

@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
 
@@ -12,7 +12,7 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 # It is up to the user to install the correct vendor-specific support.
 
-Name: llama.cpp-cuda
+Name: llama.cpp-cublas
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
@@ -32,16 +32,16 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master
 
 %build
-make -j GGML_CUDA=1
+make -j LLAMA_CUBLAS=1
 
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
+cp -p main %{buildroot}%{_bindir}/llamacppcublas
+cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
 
 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
 [Unit]
 Description=Llama.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
+ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
 
@@ -67,10 +67,10 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 
 %files
-%{_bindir}/llama-cuda-cli
-%{_bindir}/llama-cuda-server
-%{_bindir}/llama-cuda-simple
-/usr/lib/systemd/system/llamacuda.service
+%{_bindir}/llamacppcublas
+%{_bindir}/llamacppcublasserver
+%{_bindir}/llamacppcublassimple
+/usr/lib/systemd/system/llamacublas.service
 %config /etc/sysconfig/llama
 
 %pre

@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
 
@@ -38,9 +38,9 @@ make -j
 
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
+cp -p main %{buildroot}%{_bindir}/llama
+cp -p server %{buildroot}%{_bindir}/llamaserver
+cp -p simple %{buildroot}%{_bindir}/llamasimple
 
 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
@@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-server $LLAMA_ARGS
+ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
 
@@ -69,9 +69,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 
 %files
-%{_bindir}/llama-cli
-%{_bindir}/llama-server
-%{_bindir}/llama-simple
+%{_bindir}/llama
+%{_bindir}/llamaserver
+%{_bindir}/llamasimple
 /usr/lib/systemd/system/llama.service
 %config /etc/sysconfig/llama
 

32  .devops/main-cuda.Dockerfile  Normal file

@@ -0,0 +1,32 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+
+RUN apt-get update && \
+    apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1
+
+RUN make
+
+FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+
+COPY --from=build /app/main /main
+
+ENTRYPOINT [ "/main" ]

28  .devops/main-intel.Dockerfile  Normal file

@@ -0,0 +1,28 @@
+ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+
+ARG LLAMA_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+RUN mkdir build && \
+    cd build && \
+    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+        echo "LLAMA_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+    fi && \
+    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake --build . --config Release --target main
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+
+COPY --from=build /app/build/bin/main /main
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/main" ]

45  .devops/main-rocm.Dockerfile  Normal file

@@ -0,0 +1,45 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+COPY requirements requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT [ "/app/main" ]

29  .devops/main-vulkan.Dockerfile  Normal file

@@ -0,0 +1,29 @@
+ARG UBUNTU_VERSION=jammy
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget
+
+# Install Vulkan SDK
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+    apt update -y && \
+    apt-get install -y vulkan-sdk
+
+# Build it
+WORKDIR /app
+COPY . .
+RUN mkdir build && \
+    cd build && \
+    cmake .. -DLLAMA_VULKAN=1 && \
+    cmake --build . --config Release --target main
+
+# Clean up
+WORKDIR /
+RUN cp /app/build/bin/main /main && \
+    rm -rf /app
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/main" ]

20  .devops/main.Dockerfile  Normal file

@@ -0,0 +1,20 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/main /main
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/main" ]

@@ -1,108 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.0
-# Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_MUSA_DEV_CONTAINER} AS build
-
-# MUSA architecture to build for (defaults to all supported archs)
-ARG MUSA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y \
-    build-essential \
-    cmake \
-    python3 \
-    python3-pip \
-    git \
-    libcurl4-openssl-dev \
-    libgomp1
-
-COPY requirements.txt requirements.txt
-COPY requirements requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Use the default MUSA archs if not specified
-RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ${BASE_MUSA_RUN_CONTAINER} AS base
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]

@@ -6,10 +6,11 @@
 let
   inherit (config.packages) default;
   binaries = [
-    "llama-cli"
+    "llama"
     "llama-embedding"
     "llama-server"
-    "llama-quantize"
+    "quantize"
+    "train-text-from-scratch"
   ];
   mkApp = name: {
     type = "app";

@@ -1,52 +1,13 @@
-{ inputs, ... }:
-
 {
   perSystem =
-    {
-      config,
-      lib,
-      system,
-      ...
-    }:
+    { config, lib, ... }:
     {
       devShells =
-        let
-          pkgs = import inputs.nixpkgs { inherit system; };
-          stdenv = pkgs.stdenv;
-          scripts = config.packages.python-scripts;
-        in
-        lib.pipe (config.packages) [
-          (lib.concatMapAttrs (
-            name: package: {
-              ${name} = pkgs.mkShell {
-                name = "${name}";
-                inputsFrom = [ package ];
-                shellHook = ''
-                  echo "Entering ${name} devShell"
-                '';
-              };
-              "${name}-extra" =
-                if (name == "python-scripts") then
-                  null
-                else
-                  pkgs.mkShell {
-                    name = "${name}-extra";
-                    inputsFrom = [
-                      package
-                      scripts
-                    ];
-                    # Extra packages that *may* be used by some scripts
-                    packages = [
-                      pkgs.python3Packages.tiktoken
-                    ];
-                    shellHook = ''
-                      echo "Entering ${name} devShell"
-                      addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
-                    '';
-                  };
-            }
-          ))
-          (lib.filterAttrs (name: value: value != null))
-        ];
+        lib.concatMapAttrs
+          (name: package: {
+            ${name} = package.passthru.shell;
+            ${name + "-extra"} = package.passthru.shell-extra;
+          })
+          config.packages;
     };
 }

@@ -26,14 +26,16 @@
       config.cudaSupport = true;
       config.allowUnfreePredicate =
         p:
-        builtins.all (
-          license:
-          license.free
-          || builtins.elem license.shortName [
-            "CUDA EULA"
-            "cuDNN EULA"
-          ]
-        ) (p.meta.licenses or [ p.meta.license ]);
+        builtins.all
+          (
+            license:
+            license.free
+            || builtins.elem license.shortName [
+              "CUDA EULA"
+              "cuDNN EULA"
+            ]
+          )
+          (p.meta.licenses or [ p.meta.license ]);
     };
     # Ensure dependencies use ROCm consistently
     pkgsRocm = import inputs.nixpkgs {

@@ -1,36 +0,0 @@
-{
-  lib,
-  llamaVersion,
-  numpy,
-  tqdm,
-  sentencepiece,
-  pyyaml,
-  poetry-core,
-  buildPythonPackage,
-  pytestCheckHook,
-}:
-
-buildPythonPackage {
-  pname = "gguf";
-  version = llamaVersion;
-  pyproject = true;
-  nativeBuildInputs = [ poetry-core ];
-  propagatedBuildInputs = [
-    numpy
-    tqdm
-    sentencepiece
-    pyyaml
-  ];
-  src = lib.cleanSource ../../gguf-py;
-  pythonImportsCheck = [
-    "numpy"
-    "gguf"
-  ];
-  nativeCheckInputs = [ pytestCheckHook ];
-  doCheck = true;
-  meta = with lib; {
-    description = "Python package for writing binary files in the GGUF format";
-    license = licenses.mit;
-    maintainers = [ maintainers.ditsuke ];
-  };
-}

@@ -1,47 +1,36 @@
 {
   lib,
-  glibc,
   config,
   stdenv,
-  runCommand,
+  mkShell,
   cmake,
   ninja,
   pkg-config,
   git,
+  python3,
   mpi,
-  blas,
+  openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
   cudaPackages,
-  autoAddDriverRunpath,
   darwin,
   rocmPackages,
   vulkan-headers,
   vulkan-loader,
-  curl,
-  shaderc,
-  useBlas ?
-    builtins.all (x: !x) [
-      useCuda
-      useMetalKit
-      useRocm
-      useVulkan
-    ]
-    && blas.meta.available,
+  clblast,
+  useBlas ? builtins.all (x: !x) [
+    useCuda
+    useMetalKit
+    useOpenCL
+    useRocm
+    useVulkan
+  ],
   useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
-  # Increases the runtime closure size by ~700M
-  useMpi ? false,
+  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
+  useMpi ? false, # Increases the runtime closure size by ~700M
+  useOpenCL ? false,
   useRocm ? config.rocmSupport,
-  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
-  enableCurl ? true,
   useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
-  # It's necessary to consistently use backendStdenv when building with CUDA support,
-  # otherwise we get libstdc++ errors downstream.
-  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
-  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
-  precompileMetalShaders ? false,
-}:
+}@inputs:
 
 let
   inherit (lib)
@@ -49,29 +38,51 @@ let
     cmakeFeature
     optionals
     strings
+    versionOlder
     ;
 
+  # It's necessary to consistently use backendStdenv when building with CUDA support,
+  # otherwise we get libstdc++ errors downstream.
   stdenv = throw "Use effectiveStdenv instead";
+  effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
 
   suffices =
     lib.optionals useBlas [ "BLAS" ]
     ++ lib.optionals useCuda [ "CUDA" ]
    ++ lib.optionals useMetalKit [ "MetalKit" ]
     ++ lib.optionals useMpi [ "MPI" ]
+    ++ lib.optionals useOpenCL [ "OpenCL" ]
     ++ lib.optionals useRocm [ "ROCm" ]
     ++ lib.optionals useVulkan [ "Vulkan" ];
 
   pnameSuffix =
     strings.optionalString (suffices != [ ])
       "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix = strings.optionalString (
-    suffices != [ ]
-  ) ", accelerated with ${strings.concatStringsSep ", " suffices}";
+  descriptionSuffix =
+    strings.optionalString (suffices != [ ])
+      ", accelerated with ${strings.concatStringsSep ", " suffices}";
 
-  xcrunHost = runCommand "xcrunHost" { } ''
-    mkdir -p $out/bin
-    ln -s /usr/bin/xcrun $out/bin
-  '';
+  # TODO: package the Python in this repository in a Nix-like way.
+  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
+  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
+  # https://peps.python.org/pep-0517/
+  llama-python = python3.withPackages (
+    ps: [
+      ps.numpy
+      ps.sentencepiece
+    ]
+  );
+
+  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
+  llama-python-extra = python3.withPackages (
+    ps: [
+      ps.numpy
+      ps.sentencepiece
+      ps.tiktoken
+      ps.torchWithoutCuda
+      ps.transformers
+    ]
+  );
 
   # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
   # separately
@@ -85,9 +96,16 @@ let
     ++ optionals useMetalKit [ MetalKit ];
 
   cudaBuildInputs = with cudaPackages; [
-    cuda_cudart
-    cuda_cccl # <nv/target>
-    libcublas
+    cuda_cccl.dev # <nv/target>
+
+    # A temporary hack for reducing the closure size, remove once cudaPackages
+    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
+    cuda_cudart.dev
+    cuda_cudart.lib
+    cuda_cudart.static
+    libcublas.dev
+    libcublas.lib
+    libcublas.static
   ];
 
   rocmBuildInputs = with rocmPackages; [
@@ -99,149 +117,174 @@ let
   vulkanBuildInputs = [
     vulkan-headers
     vulkan-loader
-    shaderc
   ];
 in
 
-effectiveStdenv.mkDerivation (finalAttrs: {
-  pname = "llama-cpp${pnameSuffix}";
-  version = llamaVersion;
+effectiveStdenv.mkDerivation (
+  finalAttrs: {
+    pname = "llama-cpp${pnameSuffix}";
+    version = llamaVersion;
 
   # Note: none of the files discarded here are visible in the sandbox or
   # affect the output hash. This also means they can be modified without
   # triggering a rebuild.
   src = lib.cleanSourceWith {
     filter =
       name: type:
       let
         noneOf = builtins.all (x: !x);
         baseName = baseNameOf name;
       in
       noneOf [
         (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
         (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
         (lib.hasPrefix "." baseName) # Skip hidden files and directories
         (baseName == "flake.lock")
+      ];
+      src = lib.cleanSource ../../.;
+    };
+
+    postPatch = ''
+      substituteInPlace ./ggml-metal.m \
+        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+
+      # TODO: Package up each Python script or service appropriately.
+      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
+      # we could make those *.py into setuptools' entrypoints
+      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
+    '';
+
+    nativeBuildInputs =
+      [
+        cmake
+        ninja
+        pkg-config
+        git
+      ]
+      ++ optionals useCuda [
+        cudaPackages.cuda_nvcc
+
+        # TODO: Replace with autoAddDriverRunpath
+        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
+        cudaPackages.autoAddOpenGLRunpathHook
       ];
-    src = lib.cleanSource ../../.;
-  };
-
-  postPatch = ''
-    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
-      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
-      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
-  '';
+    buildInputs =
+      optionals effectiveStdenv.isDarwin darwinBuildInputs
+      ++ optionals useCuda cudaBuildInputs
+      ++ optionals useMpi [ mpi ]
+      ++ optionals useOpenCL [ clblast ]
+      ++ optionals useRocm rocmBuildInputs
+      ++ optionals useVulkan vulkanBuildInputs;
 
-  # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
-  # `default.metallib` may be compiled with Metal compiler from XCode
-  # and we need to escape sandbox on MacOS to access Metal compiler.
-  # `xcrun` is used find the path of the Metal compiler, which is varible
-  # and not on $PATH
-  # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
-  __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
-
-  nativeBuildInputs =
-    [
-      cmake
-      ninja
-      pkg-config
-      git
-    ]
-    ++ optionals useCuda [
-      cudaPackages.cuda_nvcc
-      autoAddDriverRunpath
-    ]
-    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
-    ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
-
-  buildInputs =
-    optionals effectiveStdenv.isDarwin darwinBuildInputs
-    ++ optionals useCuda cudaBuildInputs
-    ++ optionals useMpi [ mpi ]
-    ++ optionals useRocm rocmBuildInputs
-    ++ optionals useBlas [ blas ]
-    ++ optionals useVulkan vulkanBuildInputs
-    ++ optionals enableCurl [ curl ];
-
-  cmakeFlags =
-    [
-      (cmakeBool "LLAMA_BUILD_SERVER" true)
-      (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
-      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-      (cmakeBool "LLAMA_CURL" enableCurl)
-      (cmakeBool "GGML_NATIVE" false)
-      (cmakeBool "GGML_BLAS" useBlas)
-      (cmakeBool "GGML_CUDA" useCuda)
-      (cmakeBool "GGML_HIP" useRocm)
-      (cmakeBool "GGML_METAL" useMetalKit)
-      (cmakeBool "GGML_VULKAN" useVulkan)
-      (cmakeBool "GGML_STATIC" enableStatic)
-    ]
-    ++ optionals useCuda [
-      (
-        with cudaPackages.flags;
-        cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
-          builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
+    cmakeFlags =
+      [
+        (cmakeBool "LLAMA_NATIVE" false)
+        (cmakeBool "LLAMA_BUILD_SERVER" true)
+        (cmakeBool "BUILD_SHARED_LIBS" true)
+        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+        (cmakeBool "LLAMA_BLAS" useBlas)
+        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
+        (cmakeBool "LLAMA_CUBLAS" useCuda)
+        (cmakeBool "LLAMA_HIPBLAS" useRocm)
+        (cmakeBool "LLAMA_METAL" useMetalKit)
+        (cmakeBool "LLAMA_MPI" useMpi)
+        (cmakeBool "LLAMA_VULKAN" useVulkan)
+      ]
+      ++ optionals useCuda [
+        (
+          with cudaPackages.flags;
+          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
+            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
         )
-      )
-    ]
-    ++ optionals useRocm [
-      (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
-    ]
-    ++ optionals useMetalKit [
-      (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
-      (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
-    ];
-
-  # Environment variables needed for ROCm
-  env = optionals useRocm {
-    ROCM_PATH = "${rocmPackages.clr}";
-    HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
-  };
+        )
+      ]
+      ++ optionals useRocm [
+        (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
+        (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
+        # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
+        # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
+        # and select the line that matches the current nixpkgs version of rocBLAS.
+        # Should likely use `rocmPackages.clr.gpuTargets`.
+        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
+      ]
+      ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
+      ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
|
||||||
|
|
||||||
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
||||||
# if they haven't been added yet.
|
# if they haven't been added yet.
|
||||||
postInstall = ''
|
postInstall = ''
|
||||||
mkdir -p $out/include
|
mv $out/bin/main $out/bin/llama
|
||||||
cp $src/include/llama.h $out/include/
|
mv $out/bin/server $out/bin/llama-server
|
||||||
'';
|
mkdir -p $out/include
|
||||||
|
cp $src/llama.h $out/include/
|
||||||
|
'';
|
||||||
|
|
||||||
meta = {
|
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
|
||||||
# Configurations we don't want even the CI to evaluate. Results in the
|
passthru = {
|
||||||
# "unsupported platform" messages. This is mostly a no-op, because
|
inherit
|
||||||
# cudaPackages would've refused to evaluate anyway.
|
useBlas
|
||||||
badPlatforms = optionals useCuda lib.platforms.darwin;
|
useCuda
|
||||||
|
useMetalKit
|
||||||
|
useMpi
|
||||||
|
useOpenCL
|
||||||
|
useRocm
|
||||||
|
useVulkan
|
||||||
|
;
|
||||||
|
|
||||||
# Configurations that are known to result in build failures. Can be
|
shell = mkShell {
|
||||||
# overridden by importing Nixpkgs with `allowBroken = true`.
|
name = "shell-${finalAttrs.finalPackage.name}";
|
||||||
broken = (useMetalKit && !effectiveStdenv.isDarwin);
|
description = "contains numpy and sentencepiece";
|
||||||
|
buildInputs = [ llama-python ];
|
||||||
|
inputsFrom = [ finalAttrs.finalPackage ];
|
||||||
|
shellHook = ''
|
||||||
|
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
|
||||||
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
shell-extra = mkShell {
|
||||||
homepage = "https://github.com/ggerganov/llama.cpp/";
|
name = "shell-extra-${finalAttrs.finalPackage.name}";
|
||||||
license = lib.licenses.mit;
|
description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
|
||||||
|
buildInputs = [ llama-python-extra ];
|
||||||
|
inputsFrom = [ finalAttrs.finalPackage ];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
# Accommodates `nix run` and `lib.getExe`
|
meta = {
|
||||||
mainProgram = "llama-cli";
|
# Configurations we don't want even the CI to evaluate. Results in the
|
||||||
|
# "unsupported platform" messages. This is mostly a no-op, because
|
||||||
|
# cudaPackages would've refused to evaluate anyway.
|
||||||
|
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
|
||||||
|
|
||||||
# These people might respond, on the best effort basis, if you ping them
|
# Configurations that are known to result in build failures. Can be
|
||||||
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
|
# overridden by importing Nixpkgs with `allowBroken = true`.
|
||||||
# Consider adding yourself to this list if you want to ensure this flake
|
broken = (useMetalKit && !effectiveStdenv.isDarwin);
|
||||||
# stays maintained and you're willing to invest your time. Do not add
|
|
||||||
# other people without their consent. Consider removing people after
|
|
||||||
# they've been unreachable for long periods of time.
|
|
||||||
|
|
||||||
# Note that lib.maintainers is defined in Nixpkgs, but you may just add
|
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
||||||
# an attrset following the same format as in
|
homepage = "https://github.com/ggerganov/llama.cpp/";
|
||||||
# https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
|
license = lib.licenses.mit;
|
||||||
maintainers = with lib.maintainers; [
|
|
||||||
philiptaron
|
|
||||||
SomeoneSerge
|
|
||||||
];
|
|
||||||
|
|
||||||
# Extend `badPlatforms` instead
|
# Accommodates `nix run` and `lib.getExe`
|
||||||
platforms = lib.platforms.all;
|
mainProgram = "llama";
|
||||||
};
|
|
||||||
})
|
# These people might respond, on the best effort basis, if you ping them
|
||||||
|
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
|
||||||
|
# Consider adding yourself to this list if you want to ensure this flake
|
||||||
|
# stays maintained and you're willing to invest your time. Do not add
|
||||||
|
# other people without their consent. Consider removing people after
|
||||||
|
# they've been unreachable for long periods of time.
|
||||||
|
|
||||||
|
# Note that lib.maintainers is defined in Nixpkgs, but you may just add
|
||||||
|
# an attrset following the same format as in
|
||||||
|
# https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
|
||||||
|
maintainers = with lib.maintainers; [
|
||||||
|
philiptaron
|
||||||
|
SomeoneSerge
|
||||||
|
];
|
||||||
|
|
||||||
|
# Extend `badPlatforms` instead
|
||||||
|
platforms = lib.platforms.all;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
|
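For orientation, a package definition like the one above is normally consumed through the repository's flake. A minimal shell sketch follows; the `.#default` attribute path is an assumption for illustration and is not taken from this diff:

    # Build the package of the flake checked out in the current directory
    # (attribute name assumed; adjust to the flake's actual outputs)
    nix build .#default

    # `nix run` works because meta.mainProgram is set on the derivation
    nix run . -- --help

    # Enter the development shell exposed by the flake
    nix develop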
@@ -1,66 +0,0 @@
{
  lib,
  stdenv,
  buildPythonPackage,
  poetry-core,
  mkShell,
  python3Packages,
  gguf-py,
}@inputs:

let
  llama-python-deps = with python3Packages; [
    numpy
    sentencepiece
    transformers
    protobuf
    torchWithoutCuda
    gguf-py
    tqdm

    # for scripts/compare-llama-bench.py
    gitpython
    tabulate

    # for examples/pydantic-models-to-grammar-examples.py
    docstring-parser
    pydantic
  ];

  llama-python-test-deps = with python3Packages; [
    # Server bench
    matplotlib

    # server tests
    openai
    pytest
    prometheus-client
  ];
in

buildPythonPackage ({
  pname = "llama-scripts";
  version = "0.0.0";
  pyproject = true;

  # NOTE: The files filtered out here are not visible in the build sandbox, neither
  # do they affect the output hash. They can be modified without triggering a rebuild.
  src = lib.cleanSourceWith {
    filter =
      name: type:
      let
        any = builtins.any (x: x);
        baseName = builtins.baseNameOf name;
      in
      any [
        (lib.hasSuffix ".py" name)
        (baseName == "README.md")
        (baseName == "pyproject.toml")
      ];
    src = lib.cleanSource ../../.;
  };
  nativeBuildInputs = [ poetry-core ];
  nativeCheckInputs = llama-python-test-deps;
  dependencies = llama-python-deps;
})
@@ -1,41 +1,19 @@
 {
   lib,
   newScope,
-  python3,
   llamaVersion ? "0.0.0",
 }:

-let
-  pythonPackages = python3.pkgs;
-  buildPythonPackage = pythonPackages.buildPythonPackage;
-  numpy = pythonPackages.numpy;
-  tqdm = pythonPackages.tqdm;
-  sentencepiece = pythonPackages.sentencepiece;
-  pyyaml = pythonPackages.pyyaml;
-  poetry-core = pythonPackages.poetry-core;
-  pytestCheckHook = pythonPackages.pytestCheckHook;
-in
-
 # We're using `makeScope` instead of just writing out an attrset
 # because it allows users to apply overlays later using `overrideScope'`.
 # Cf. https://noogle.dev/f/lib/makeScope

-lib.makeScope newScope (self: {
-  inherit llamaVersion;
-  gguf-py = self.callPackage ./package-gguf-py.nix {
-    inherit
-      buildPythonPackage
-      numpy
-      tqdm
-      sentencepiece
-      poetry-core
-      pyyaml
-      pytestCheckHook
-      ;
-  };
-  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
-  llama-cpp = self.callPackage ./package.nix { };
-  docker = self.callPackage ./docker.nix { };
-  docker-min = self.callPackage ./docker.nix { interactive = false; };
-  sif = self.callPackage ./sif.nix { };
-})
+lib.makeScope newScope (
+  self: {
+    inherit llamaVersion;
+    llama-cpp = self.callPackage ./package.nix { };
+    docker = self.callPackage ./docker.nix { };
+    docker-min = self.callPackage ./docker.nix { interactive = false; };
+    sif = self.callPackage ./sif.nix { };
+  }
+)
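Since the scope above wires `llama-cpp`, `docker`, `docker-min`, and `sif` into the package set, a typical consumption path looks roughly like the following shell sketch. The flake attribute paths, and the assumption that the docker output is a loadable image tarball, are illustrative and not confirmed by this diff:

    # Hypothetical attribute paths; adjust to the flake's actual outputs
    nix build .#llama-cpp
    nix build .#docker && docker load < result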
@@ -7,7 +7,7 @@
 }:

 let
   optionalInt = cond: x: if cond then x else 0;
 in
 singularity-tools.buildImage rec {
   inherit (llama-cpp) name;
@@ -1,113 +0,0 @@
ARG UBUNTU_VERSION=24.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.3
ARG AMDGPU_VERSION=6.3

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

### Build image
FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
# gfx906 is deprecated
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html

#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
ARG ROCM_DOCKER_ARCH=gfx1100

# Set nvcc architectured
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
# ENV CC=/opt/rocm/llvm/bin/clang
# ENV CXX=/opt/rocm/llvm/bin/clang++

RUN apt-get update \
    && apt-get install -y \
    build-essential \
    cmake \
    git \
    libcurl4-openssl-dev \
    curl \
    libgomp1

WORKDIR /app

COPY . .

RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
    && cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib \
    && find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_ROCM_DEV_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3-pip \
    python3 \
    python3-wheel\
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
32 .devops/server-cuda.Dockerfile Normal file
@@ -0,0 +1,32 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

COPY --from=build /app/server /server

ENTRYPOINT [ "/server" ]
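As a usage sketch for a server image like the one above (the image tag, paths, and GPU flags are placeholders chosen for illustration; the runtime options are the usual llama.cpp server flags rather than anything mandated by this diff):

    docker build -t llama-cpp-server-cuda -f .devops/server-cuda.Dockerfile .
    docker run --gpus all -p 8080:8080 -v /path/to/models:/models \
        llama-cpp-server-cuda -m /models/model.gguf --host 0.0.0.0 --port 8080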
28 .devops/server-intel.Dockerfile Normal file
@@ -0,0 +1,28 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git

WORKDIR /app

COPY . .

RUN mkdir build && \
    cd build && \
    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
        echo "LLAMA_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
    fi && \
    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
    cmake --build . --config Release --target server

FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

COPY --from=build /app/build/bin/server /server

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/server" ]
45 .devops/server-rocm.Dockerfile Normal file
@@ -0,0 +1,45 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

RUN make

ENTRYPOINT [ "/app/server" ]
29 .devops/server-vulkan.Dockerfile Normal file
@@ -0,0 +1,29 @@
ARG UBUNTU_VERSION=jammy

FROM ubuntu:$UBUNTU_VERSION as build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk

# Build it
WORKDIR /app
COPY . .
RUN mkdir build && \
    cd build && \
    cmake .. -DLLAMA_VULKAN=1 && \
    cmake --build . --config Release --target server

# Clean up
WORKDIR /
RUN cp /app/build/bin/server /server && \
    rm -rf /app

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/server" ]
20 .devops/server.Dockerfile Normal file
@@ -0,0 +1,20 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

RUN make

FROM ubuntu:$UBUNTU_VERSION as runtime

COPY --from=build /app/server /server

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/server" ]
@@ -8,40 +8,36 @@ arg1="$1"
 shift

 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    exec python3 ./convert_hf_to_gguf.py "$@"
+    python3 ./convert.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    exec ./llama-quantize "$@"
+    ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    exec ./llama-cli "$@"
-elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
-    exec ./llama-bench "$@"
-elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
-    exec ./llama-perplexity "$@"
+    ./main "$@"
+elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
+    ./finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
-    for i in $(ls $1/$2/ggml-model-f16.bin*); do
+    for i in `ls $1/$2/ggml-model-f16.bin*`; do
         if [ -f "${i/f16/q4_0}" ]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            ./quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    exec ./llama-server "$@"
+    ./server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
     echo "  --run (-r): Run a model previously converted into ggml"
     echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
-    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
-    echo "                ex: -m model.gguf"
-    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
-    echo "                     ex: -m model.gguf -f file.txt"
     echo "  --convert (-c): Convert a llama model into ggml"
     echo "                  ex: --outtype f16 \"/models/7B/\" "
     echo "  --quantize (-q): Optimize with quantization process ggml"
     echo "                   ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
+    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
+    echo "                   See documentation for finetune for command-line parameters"
     echo "  --all-in-one (-a): Execute --convert & --quantize"
     echo "                     ex: \"/models/\" 7B"
     echo "  --server (-s): Run a model on the server"
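The dispatcher above is used as the entrypoint of the "full" container images, so its subcommands are passed straight through `docker run`. A hedged sketch, with the image name and model paths as placeholders (the argument forms follow the help text printed by the script):

    docker run -v /path/to/models:/models llama-cpp-full --convert --outtype f16 "/models/7B/"
    docker run -v /path/to/models:/models llama-cpp-full --quantize \
        "/models/7B/ggml-model-f16.bin" "/models/7B/ggml-model-q4_0.bin" 2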
@@ -1,89 +0,0 @@
ARG UBUNTU_VERSION=24.04

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl

# Build it
WORKDIR /app

COPY . .

RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ubuntu:$UBUNTU_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl libvulkan-dev \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    python3-wheel \
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
@@ -1,7 +1,7 @@
 *.o
 *.a
 .cache/
-# Do not ignore .git directory, otherwise the reported build number will always be 0
+.git/
 .github/
 .gitignore
 .vs/
@@ -12,8 +12,8 @@ build*/

 models/*

-/llama-cli
-/llama-quantize
+/main
+/quantize

 arm_neon.h
 compile_commands.json
2 .ecrc
@@ -1,5 +1,5 @@
 {
-    "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
+    "Exclude": ["^\\.gitmodules$"],
     "Disable": {
         "IndentSize": true
     }
 }
@@ -24,27 +24,5 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2

-[examples/server/public/deps_*]
-trim_trailing_whitespace = unset
-indent_style = unset
-indent_size = unset
-
-[examples/server/deps_*]
-trim_trailing_whitespace = unset
-indent_style = unset
-indent_size = unset
-
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
-
-[examples/cvector-generator/*.txt]
-trim_trailing_whitespace = unset
-insert_final_newline = unset
-
-[models/templates/*.jinja]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
-trim_trailing_whitespace = unset
-insert_final_newline = unset
16 .flake8
@@ -1,17 +1,3 @@
 [flake8]
 max-line-length = 125
-ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
-exclude =
-    # Do not traverse examples
-    examples,
-    # Do not include package initializers
-    __init__.py,
-    # No need to traverse our git directory
-    .git,
-    # There's no value in checking cache directories
-    __pycache__,
-    # No need to include the build path
-    build,
-    # This contains builds that we don't want to check
-    dist # This is generated with `python build .` for package releases
-# max-complexity = 10
+ignore = W503
87 .github/ISSUE_TEMPLATE/010-bug-compilation.yml vendored
@@ -1,87 +0,0 @@
name: Bug (compilation)
description: Something goes wrong when trying to compile llama.cpp.
title: "Compile bug: "
labels: ["bug-unconfirmed", "compilation"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for bug reports where the compilation of llama.cpp fails.
        Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
        by clearing `~/.cache/ccache` (on Linux).
  - type: textarea
    id: commit
    attributes:
      label: Git commit
      description: Which commit are you trying to compile?
      placeholder: |
        $git rev-parse HEAD
        84a07a17b1b08cf2b9747c633a2372782848a27f
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: true
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
      multiple: true
    validations:
      required: true
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
      placeholder: >
        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
        Here are the exact commands that I used: ...
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: command
    attributes:
      label: Compile command
      description: >
        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        Please copy and paste any relevant log output, including any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
101 .github/ISSUE_TEMPLATE/011-bug-results.yml vendored
@@ -1,101 +0,0 @@
name: Bug (model use)
description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
title: "Eval bug: "
labels: ["bug-unconfirmed", "model evaluation"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for bug reports where the model evaluation results
        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
        The `llama-cli` binary can be used for simple and reproducible model inference.
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: true
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
      multiple: true
    validations:
      required: true
  - type: textarea
    id: hardware
    attributes:
      label: Hardware
      description: Which CPUs/GPUs are you using?
      placeholder: >
        e.g. Ryzen 5950X + 2x RTX 4090
    validations:
      required: true
  - type: textarea
    id: model
    attributes:
      label: Models
      description: >
        Which model(s) at which quantization were you using when encountering the bug?
        If you downloaded a GGUF file off of Huggingface, please provide a link.
      placeholder: >
        e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
    validations:
      required: false
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
        that information would be very much appreciated by us.
      placeholder: >
        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
        When I use -ngl 0 it works correctly.
        Here are the exact commands that I used: ...
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        Please copy and paste any relevant log output, including the command that you entered and any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
91 .github/ISSUE_TEMPLATE/019-bug-misc.yml vendored
@@ -1,91 +0,0 @@
name: Bug (misc.)
description: Something is not working the way it should (and it's not covered by any of the above cases).
title: "Misc. bug: "
labels: ["bug-unconfirmed"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for miscellaneous bugs that don't fit into any other category.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which version of our software is affected? (You can use `--version` to get a version string.)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: dropdown
    id: module
    attributes:
      label: Which llama.cpp modules do you know to be affected?
      multiple: true
      options:
        - Documentation/Github
        - libllama (core library)
        - llama-cli
        - llama-server
        - llama-bench
        - llama-quantize
        - Python/Bash scripts
        - Test code
        - Other (Please specify in the next section)
    validations:
      required: false
  - type: textarea
    id: command
    attributes:
      label: Command line
      description: >
        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: false
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        If applicable, please copy and paste any relevant log output, including any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: false
51 .github/ISSUE_TEMPLATE/020-enhancement.yml vendored
@@ -1,51 +0,0 @@
name: Enhancement
description: Used to request enhancements for llama.cpp.
title: "Feature Request: "
labels: ["enhancement"]
body:
  - type: markdown
    attributes:
      value: |
        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)

  - type: checkboxes
    id: prerequisites
    attributes:
      label: Prerequisites
      description: Please confirm the following before submitting your enhancement request.
      options:
        - label: I am running the latest code. Mention the version if possible as well.
          required: true
        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
          required: true
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
          required: true

  - type: textarea
    id: feature-description
    attributes:
      label: Feature Description
      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
      placeholder: Detailed description of the enhancement
    validations:
      required: true

  - type: textarea
    id: motivation
    attributes:
      label: Motivation
      description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
      placeholder: Explanation of why this feature is needed and its benefits
    validations:
      required: true

  - type: textarea
    id: possible-implementation
    attributes:
      label: Possible Implementation
      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
      placeholder: Detailed description of potential implementation
    validations:
      required: false
52 .github/ISSUE_TEMPLATE/030-research.yml vendored
@@ -1,52 +0,0 @@
name: Research
description: Track new technical research area.
title: "Research: "
labels: ["research 🔬"]
body:
  - type: markdown
    attributes:
      value: |
        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

  - type: checkboxes
    id: research-stage
    attributes:
      label: Research Stage
      description: Track general state of this research ticket
      options:
        - label: Background Research (Let's try to avoid reinventing the wheel)
        - label: Hypothesis Formed (How do you think this will work and it's effect?)
        - label: Strategy / Implementation Forming
        - label: Analysis of results
        - label: Debrief / Documentation (So people in the future can learn from us)

  - type: textarea
    id: background
    attributes:
      label: Previous existing literature and research
      description: Whats the current state of the art and whats the motivation for this research?

  - type: textarea
    id: hypothesis
    attributes:
      label: Hypothesis
      description: How do you think this will work and it's effect?

  - type: textarea
    id: implementation
    attributes:
      label: Implementation
      description: Got an approach? e.g. a PR ready to go?

  - type: textarea
    id: analysis
    attributes:
      label: Analysis
      description: How does the proposed implementation behave?

  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
28 .github/ISSUE_TEMPLATE/040-refactor.yml vendored
@@ -1,28 +0,0 @@
name: Refactor (Maintainers)
description: Used to track refactoring opportunities.
title: "Refactor: "
labels: ["refactor"]
body:
  - type: markdown
    attributes:
      value: |
        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

  - type: textarea
    id: background-description
    attributes:
      label: Background Description
      description: Please provide a detailed written description of the pain points you are trying to solve.
      placeholder: Detailed description behind your motivation to request refactor
    validations:
      required: true

  - type: textarea
    id: possible-approaches
    attributes:
      label: Possible Refactor Approaches
      description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
      placeholder: Your idea of possible refactoring opportunity/approaches
    validations:
      required: false
11 .github/ISSUE_TEMPLATE/bug.md vendored Normal file
@@ -0,0 +1,11 @@
---
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug-unconfirmed"]
assignees: ''

---

Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.

If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
11 .github/ISSUE_TEMPLATE/config.yml vendored
@@ -1,11 +0,0 @@
blank_issues_enabled: true
contact_links:
  - name: Got an idea?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
    about: Pop it there. It may then become an enhancement ticket.
  - name: Got a question?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
    about: Ask a question there!
  - name: Want to contribute?
    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
28 .github/ISSUE_TEMPLATE/enhancement.md vendored Normal file
@@ -0,0 +1,28 @@
---
name: Enhancement template
about: Used to request enhancements for llama.cpp
labels: ["enhancement"]
assignees: ''

---

# Prerequisites

Please answer the following questions for yourself before submitting an issue.

- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.

# Feature Description

Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.

# Motivation

Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.

# Possible Implementation

If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
86 .github/labeler.yml vendored
@@ -1,86 +0,0 @@
# https://github.com/actions/labeler
Kompute:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-kompute.h
          - ggml/src/ggml-kompute/**
          - README-kompute.md
Apple Metal:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-metal.h
          - ggml/src/ggml-metal/**
          - README-metal.md
SYCL:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-sycl.h
          - ggml/src/ggml-sycl/**
          - docs/backend/SYCL.md
          - examples/sycl/**
Nvidia GPU:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-cuda.h
          - ggml/src/ggml-cuda/**
Vulkan:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-vulkan.h
          - ggml/src/ggml-vulkan/**
documentation:
  - changed-files:
      - any-glob-to-any-file:
          - docs/**
          - media/**
testing:
  - changed-files:
      - any-glob-to-any-file:
          - tests/**
build:
  - changed-files:
      - any-glob-to-any-file:
          - cmake/**
          - CMakeLists.txt
          - CMakePresets.json
examples:
  - changed-files:
      - any-glob-to-any-file: examples/**
devops:
  - changed-files:
      - any-glob-to-any-file:
          - .devops/**
          - .github/**
          - ci/**
python:
  - changed-files:
      - any-glob-to-any-file:
          - "**/*.py"
          - requirements/**
          - gguf-py/**
          - .flake8
script:
  - changed-files:
      - any-glob-to-any-file:
          - scripts/**
android:
  - changed-files:
      - any-glob-to-any-file:
          - examples/llama.android/**
server:
  - changed-files:
      - any-glob-to-any-file:
          - examples/server/**
ggml:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/**
nix:
  - changed-files:
      - any-glob-to-any-file:
          - "**/*.nix"
          - .github/workflows/nix-*.yml
          - .devops/nix/nixpkgs-instances.nix
embedding:
  - changed-files:
      - any-glob-to-any-file: examples/embedding/
1 .github/pull_request_template.md vendored
@@ -1 +0,0 @@
*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
315  .github/workflows/bench.yml.disabled  (vendored)
@@ -1,315 +0,0 @@
# TODO: there have been some issues with the workflow, so disabling for now
# https://github.com/ggerganov/llama.cpp/issues/7893
#
# Benchmark
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    - cron: '04 2 * * *'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
      N_USERS: 8
      DURATION: 10m

    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"

    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
      || (
        github.event_name == 'schedule'
        && github.ref_name == 'master'
        && github.repository_owner == 'ggerganov'
      )
      || github.event_name == 'pull_request_target'
      || (
        github.event_name == 'push'
        && github.event.ref == 'refs/heads/master'
        && github.repository_owner == 'ggerganov'
      )

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'

      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
            --with github.com/phymbert/xk6-sse

      - name: Build
        id: cmake_build
        run: |
          set -eux
          cmake -B build \
            -DGGML_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DLLAMA_CUBLAS=ON \
            -DCUDAToolkit_ROOT=/usr/local/cuda \
            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
            -DCMAKE_CUDA_ARCHITECTURES=75 \
            -DLLAMA_FATAL_WARNINGS=OFF \
            -DLLAMA_ALL_WARNINGS=OFF \
            -DCMAKE_BUILD_TYPE=Release;
          cmake --build build --config Release -j $(nproc) --target llama-server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
        id: server_bench
        env:
          HEAD_REF: ${{ github.head_ref || github.ref_name }}
        run: |
          set -eux

          cd examples/server/bench
          source venv/bin/activate
          python bench.py \
            --runner-label ${{ env.RUNNER_LABEL }} \
            --name ${{ github.job }} \
            --branch $HEAD_REF \
            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
            --scenario script.js \
            --duration ${{ github.event.inputs.duration || env.DURATION }} \
            --hf-repo ggml-org/models \
            --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
            --model-path-prefix /models \
            --parallel ${{ env.N_USERS }} \
            -ngl 33 \
            --batch-size 2048 \
            --ubatch-size 256 \
            --ctx-size 16384 \
            --n-prompts 1000 \
            --max-prompt-tokens 1024 \
            --max-tokens 2048

          cat results.github.env >> $GITHUB_ENV

          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true # Important as it looks unstable: 503
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux

          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">

            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

            </p>

            <details>

            <summary>Expand details for performance related PR only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>
            </details>
            </details>
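One detail of the removed benchmark workflow worth calling out: the `Extract mermaid` step writes multi-line values into `$GITHUB_ENV`, which only works with the `NAME<<DELIMITER` heredoc syntax it uses; a plain `NAME=value` line would keep just the first line of the chart. A minimal sketch of the same pattern, with `CHART` and `chart.mermaid` as hypothetical names used only for illustration:

```bash
# Append a multi-line value to the GitHub Actions environment file.
# CHART / chart.mermaid are placeholders, not part of the original workflow.
CHART=$(cat chart.mermaid)
{
  echo "CHART<<EOF"   # start of a multi-line value named CHART
  echo "$CHART"       # the value itself, possibly many lines
  echo "EOF"          # closing delimiter
} >> "$GITHUB_ENV"
```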
1271  .github/workflows/build.yml  (vendored)
File diff suppressed because it is too large.
28  .github/workflows/close-issue.yml  (vendored)
@@ -1,28 +0,0 @@
name: Close inactive issues
on:
  schedule:
    - cron: "42 0 * * *"

# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
  issues: write

jobs:
  close-issues:
    runs-on: ubuntu-latest
    permissions:
      issues: write
      pull-requests: write
    steps:
      - uses: actions/stale@v5
        with:
          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
          days-before-issue-stale: 30
          days-before-issue-close: 14
          stale-issue-label: "stale"
          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
          days-before-pr-stale: -1
          days-before-pr-close: -1
          operations-per-run: 10000
          repo-token: ${{ secrets.GITHUB_TOKEN }}
36  .github/workflows/code-coverage.yml  (vendored, new file)
@@ -0,0 +1,36 @@
name: Code Coverage
on: [push, pull_request]

env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1

jobs:
  run:
    runs-on: ubuntu-20.04
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential gcc-8 lcov

      - name: Build
        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests

      - name: Run tests
        run: CC=gcc-8 make test

      - name: Generate coverage report
        run: |
          make coverage
          make lcov-report

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        with:
          files: lcov-report/coverage.info
156  .github/workflows/docker.yml  (vendored)
@@ -10,50 +10,45 @@
 name: Publish Docker image
 
 on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    # Rebuild daily rather than on every push because it is expensive
-    - cron: '12 4 * * *'
+  pull_request:
+  push:
+    branches:
+      - master
 
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  packages: write
 
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
+    if: github.event.pull_request.draft == false
 
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     env:
       COMMIT_SHA: ${{ github.sha }}
     strategy:
-      fail-fast: false
       matrix:
         config:
-          # Multi-stage build
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
+          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
+          # have disabled them for now until the reason why
+          # is understood.
+          - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
     steps:
       - name: Check out the repo
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0 # preserve git history, so we can determine the build number
+        uses: actions/checkout@v3
 
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
+        uses: docker/setup-qemu-action@v2
 
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2
 
       - name: Log in to Docker Hub
         uses: docker/login-action@v2
@@ -62,45 +57,9 @@ jobs:
           username: ${{ github.repository_owner }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
-          REPO_NAME="${{ github.event.repository.name }}"
-
-          # determine tag name postfix (build number, commit hash)
-          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
-            TAG_POSTFIX="-b${BUILD_NUMBER}"
-          else
-            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
-            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
-          fi
-          # list all tags possible
-          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
-            TYPE=""
-          else
-            TYPE="-${{ matrix.config.tag }}"
-          fi
-          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
-          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
-          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
-          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
-          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
-          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
-          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
-          echo "full_output_tags=$FULLTAGS" # print out for debugging
-          echo "light_output_tags=$LIGHTTAGS" # print out for debugging
-          echo "server_output_tags=$SERVERTAGS" # print out for debugging
-        env:
-          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
+      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
       - name: Free Disk Space (Ubuntu)
-        if: ${{ matrix.config.free_disk_space == true }}
-        uses: ggml-org/free-disk-space@v1.3.1
+        uses: jlumbroso/free-disk-space@main
         with:
           # this might remove tools that are actually needed,
           # if set to "true" but frees about 6 GB
@@ -115,59 +74,34 @@ jobs:
           docker-images: true
           swap-storage: true
 
-      - name: Build and push Full Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.full_output_tags }}
-          file: ${{ matrix.config.dockerfile }}
-          target: full
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
 
-      - name: Build and push Light Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
-        uses: docker/build-push-action@v6
+      - name: Build and push Docker image (versioned)
+        if: github.event_name == 'push'
+        uses: docker/build-push-action@v4
         with:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.light_output_tags }}
+          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
           file: ${{ matrix.config.dockerfile }}
-          target: light
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
 
-      - name: Build and push Server Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
-        uses: docker/build-push-action@v6
+      - name: Build and push Docker image (tagged)
+        uses: docker/build-push-action@v4
         with:
           context: .
-          push: true
+          push: ${{ github.event_name == 'push' }}
           platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.server_output_tags }}
+          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}
-          target: server
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
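A side note on the removed `Determine tag name` step in the newer docker.yml above: it relies on bash's `${parameter@operator}` transformation (`${GITHUB_REPOSITORY_OWNER@L}`) to lower-case the owner, since container registries expect lowercase image names. A rough stand-alone sketch of that tag construction, with a made-up owner argument and assuming a recent bash that supports the `@L` expansion:

```bash
#!/usr/bin/env bash
# Hypothetical stand-alone version of the tag-name logic above (illustration only).
OWNER="${1:-SomeOwner}"                        # placeholder repository owner
OWNER="${OWNER@L}"                             # lower-case, as the workflow does
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
BRANCH="$(git rev-parse --abbrev-ref HEAD)"
if [[ "$BRANCH" == "master" ]]; then
    TAG_POSTFIX="-b${BUILD_NUMBER}"
else
    TAG_POSTFIX="-${BRANCH//\//-}-${SHORT_HASH}"   # sanitize slashes in branch names
fi
echo "ghcr.io/${OWNER}/llama.cpp:full${TAG_POSTFIX}"
```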
10  .github/workflows/editorconfig.yml  (vendored)
@@ -14,16 +14,10 @@ on:
     branches:
       - master
 
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
 jobs:
   editorconfig:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
-      - uses: editorconfig-checker/action-editorconfig-checker@v2
-        with:
-          version: v3.0.3
+      - uses: actions/checkout@v3
+      - uses: editorconfig-checker/action-editorconfig-checker@main
       - run: editorconfig-checker
4  .github/workflows/gguf-publish.yml  (vendored)
@@ -24,9 +24,9 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v2
         with:
           python-version: '3.9.x'
       - name: Install dependencies
17  .github/workflows/labeler.yml  (vendored)
@@ -1,17 +0,0 @@
name: "Pull Request Labeler"
on:
  - pull_request_target

jobs:
  labeler:
    permissions:
      contents: read
      pull-requests: write
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          repository: "ggerganov/llama.cpp"
      - uses: actions/labeler@v5
        with:
          configuration-path: '.github/labeler.yml'
61  .github/workflows/nix-ci-aarch64.yml  (vendored, new file)
@@ -0,0 +1,61 @@
name: Nix aarch64 builds

on:
  workflow_dispatch: # allows manual triggering
  schedule:
    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
    # 1.5h instead of minutes with the cold cache).
    #
    # randint(0, 59), randint(0, 23)
    - cron: '26 12 * * *'
  # But also rebuild if we touched any of the Nix expressions:
  push:
    branches:
      - master
    paths: ['**/*.nix', 'flake.lock']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['**/*.nix', 'flake.lock']

jobs:
  nix-build-aarch64:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install QEMU
        # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
        run: |
          sudo apt-get update
          sudo apt-get install -y qemu-user-static qemu-system-aarch64
          sudo usermod -a -G kvm $USER
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-platforms = aarch64-linux
            extra-system-features = nixos-test kvm
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: llama-cpp
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.aarch64-linux"
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --systems aarch64-linux
          --flake
          ".#checks.aarch64-linux"
68  .github/workflows/nix-ci.yml  (vendored, new file)
@@ -0,0 +1,68 @@
name: Nix CI

on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
  pull_request:
    types: [opened, synchronize, reopened]

jobs:
  nix-eval:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: List all flake outputs
        run: nix flake show --all-systems
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
  nix-build:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: llama-cpp
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --flake
          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
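The flake outputs that this added workflow exercises can also be inspected locally; a minimal sketch, assuming a Nix installation with flakes enabled and run from the repository root (this is a rough local equivalent, not the exact CI invocation, which goes through nix-eval-jobs and nix-fast-build):

```bash
# List every package and check the flake exposes (same idea as the
# "List all flake outputs" step above).
nix flake show --all-systems

# Evaluate and build the flake's checks for the current system.
nix flake check -L
```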
22  .github/workflows/nix-flake-update.yml  (vendored, new file)
@@ -0,0 +1,22 @@
name: update-flake-lock
on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00

jobs:
  lockfile:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@main
      - name: Update flake.lock
        uses: DeterminateSystems/update-flake-lock@main
        with:
          pr-title: "nix: update flake.lock"
          pr-labels: |
            nix
          pr-reviewers: philiptaron,SomeoneSerge
          token: ${{ secrets.FLAKE_TOKEN }}
36  .github/workflows/nix-publish-flake.yml  (vendored, new file)
@@ -0,0 +1,36 @@
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
  push:
    tags:
      - "*"
  workflow_dispatch:
    inputs:
      tag:
        description: "The existing tag to publish"
        type: "string"
        required: true
jobs:
  flakestry-publish:
    runs-on: ubuntu-latest
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: flakestry/flakestry-publish@main
        with:
          version: "${{ inputs.tag || github.ref_name }}"
  flakehub-publish:
    runs-on: "ubuntu-latest"
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: "actions/checkout@v4"
        with:
          ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
      - uses: "DeterminateSystems/nix-installer-action@main"
      - uses: "DeterminateSystems/flakehub-push@main"
        with:
          visibility: "public"
          tag: "${{ inputs.tag }}"
18  .github/workflows/python-check-requirements.yml  (vendored)
@@ -3,20 +3,16 @@ name: Python check requirements.txt
 on:
   push:
     paths:
-      - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
-      - '**/requirements*.txt'
+      - 'requirements.txt'
+      - 'requirements/*.txt'
   pull_request:
     paths:
-      - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
-      - '**/requirements*.txt'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
+      - 'requirements.txt'
+      - 'requirements/*.txt'
 
 jobs:
   python-check-requirements:
@@ -24,10 +20,10 @@ jobs:
     name: check-requirements
     steps:
       - name: Check out source repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
       - name: Set up Python environment
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
         with:
           python-version: "3.11"
       - name: Run check-requirements.sh script
-        run: bash scripts/check-requirements.sh
+        run: bash scripts/check-requirements.sh nocleanup
20  .github/workflows/python-lint.yml  (vendored)
@@ -1,17 +1,6 @@
 name: flake8 Lint
 
-on:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/python-lint.yml', '**/*.py']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/python-lint.yml', '**/*.py']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
+on: [push, pull_request]
 
 jobs:
   flake8-lint:
@@ -19,12 +8,13 @@ jobs:
     name: Lint
     steps:
       - name: Check out source repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
       - name: Set up Python environment
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
         with:
           python-version: "3.11"
       - name: flake8 Lint
         uses: py-actions/flake8@v2
         with:
-          plugins: "flake8-no-print"
+          ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
+          exclude: "examples/*,examples/*/**,*/**/__init__.py"
40  .github/workflows/python-type-check.yml  (vendored)
@@ -1,40 +0,0 @@
name: Python Type-Check

on:
  push:
    paths:
      - '.github/workflows/python-type-check.yml'
      - 'pyrightconfig.json'
      - '**.py'
      - '**/requirements*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-type-check.yml'
      - 'pyrightconfig.json'
      - '**.py'
      - '**/requirements*.txt'

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  python-type-check:
    runs-on: ubuntu-latest
    name: pyright type-check
    steps:
      - name: Check out source repository
        uses: actions/checkout@v4
      - name: Set up Python environment
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Python dependencies
        # TODO: use a venv
        run: pip install -r requirements/requirements-all.txt
      - name: Type-check with Pyright
        uses: jakebailey/pyright-action@v2
        with:
          version: 1.1.382
          level: warning
          warnings: true
234  .github/workflows/server.yml  (vendored)
@@ -3,32 +3,13 @@ name: Server
 
 on:
   workflow_dispatch: # allows manual triggering
-    inputs:
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      slow_tests:
-        description: 'Run slow tests'
-        required: true
-        type: boolean
   push:
     branches:
       - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
-
-env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
 
 jobs:
   server:
@@ -36,204 +17,67 @@ jobs:
 
     strategy:
       matrix:
-        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
-        build_type: [RelWithDebInfo]
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        build_type: [Debug, Release]
         include:
           - build_type: Release
             sanitizer: ""
-      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
+        exclude:
+          - build_type: Release
+            sanitizer: ADDRESS
+          - build_type: Release
+            sanitizer: THREAD
+          - build_type: Release
+            sanitizer: UNDEFINED
+
+    container:
+      image: ubuntu:latest
+      ports:
+        - 8888
+      options: --cpus 4
 
     steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
       - name: Dependencies
         id: depends
         run: |
-          sudo apt-get update
-          sudo apt-get -y install \
+          apt-get update
+          apt-get -y install \
            build-essential \
-            xxd \
            git \
            cmake \
-            curl \
+            python3-pip \
            wget \
-            language-pack-en \
-            libcurl4-openssl-dev
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r examples/server/tests/requirements.txt
-
-      # Setup nodejs (to be used for verifying bundled index.html)
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '22.11.0'
-
-      - name: WebUI - Install dependencies
-        id: webui_lint
-        run: |
-          cd examples/server/webui
-          npm ci
-
-      - name: WebUI - Check code format
-        id: webui_format
-        run: |
-          git config --global --add safe.directory $(realpath .)
-          cd examples/server/webui
-          git status
-
-          npm run format
-          git status
-          modified_files="$(git status -s)"
-          echo "Modified files: ${modified_files}"
-          if [ -n "${modified_files}" ]; then
-            echo "Files do not follow coding style. To fix: npm run format"
-            echo "${modified_files}"
-            exit 1
-          fi
-
-      - name: Verify bundled index.html
-        id: verify_server_index_html
-        run: |
-          git config --global --add safe.directory $(realpath .)
-          cd examples/server/webui
-          git status
-
-          npm run build
-          git status
-          modified_files="$(git status -s)"
-          echo "Modified files: ${modified_files}"
-          if [ -n "${modified_files}" ]; then
-            echo "Repository is dirty or server/webui is not built as expected"
-            echo "Hint: You may need to follow Web UI build guide in server/README.md"
-            echo "${modified_files}"
-            exit 1
-          fi
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DLLAMA_CURL=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DGGML_OPENMP=OFF ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build_sanitizers
-        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DLLAMA_CURL=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build
-        if: ${{ matrix.sanitizer == '' }}
-        run: |
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DLLAMA_CURL=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ matrix.sanitizer == '' }}
-        run: |
-          cd examples/server/tests
-          ./tests.sh
-
-      - name: Tests (sanitizers)
-        id: server_integration_tests_sanitizers
-        if: ${{ matrix.sanitizer != '' }}
-        run: |
-          cd examples/server/tests
-          LLAMA_SANITIZE=1 ./tests.sh
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
-        run: |
-          cd examples/server/tests
-          SLOW_TESTS=1 ./tests.sh
-
-
-  server-windows:
-    runs-on: windows-2019
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: libCURL
-        id: get_libcurl
-        env:
-          CURL_VERSION: 8.6.0_6
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
-          mkdir $env:RUNNER_TEMP/libcurl
-          tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
+            psmisc
 
       - name: Build
         id: cmake_build
         run: |
-          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
+          mkdir build
+          cd build
+          cmake .. \
+            -DLLAMA_NATIVE=OFF \
+            -DLLAMA_BUILD_SERVER=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
+          cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
 
       - name: Tests dependencies
         id: test_dependencies
         run: |
           pip install -r examples/server/tests/requirements.txt
 
-      - name: Copy Libcurl
-        id: prepare_libcurl
+      - name: Download models
+        id: download_models
         run: |
-          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
+          cd examples/server/tests
+          ../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf
 
       - name: Tests
-        id: server_integration_tests
-        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
+        id: server_integration_test
         run: |
           cd examples/server/tests
-          $env:PYTHONIOENCODING = ":replace"
-          pytest -v -x -m "not slow"
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
-        run: |
-          cd examples/server/tests
-          $env:SLOW_TESTS = "1"
-          pytest -v -x
+          PORT=8888 ./tests.sh
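Both versions of the workflow above drive the same integration-test entry point under `examples/server/tests`, so the CI run can be reproduced locally. A minimal sketch based on the commands visible in the workflow (a built `llama-server` binary and Python 3 are assumed to be available):

```bash
# Install the test-harness dependencies, then run the server integration tests
# the same way the workflow does.
pip install -r examples/server/tests/requirements.txt
cd examples/server/tests
./tests.sh               # regular test suite
SLOW_TESTS=1 ./tests.sh  # slow tests, as in the scheduled CI job
```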
20  .github/workflows/tidy-post.yml  (vendored, new file)
@@ -0,0 +1,20 @@
name: clang-tidy review post comments

on:
  workflow_dispatch:
    workflows: ["clang-tidy-review"]
    types:
      - completed

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: ZedThree/clang-tidy-review/post@v0.13.0
        # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
        with:
          # adjust options as necessary
          lgtm_comment_body: ''
          annotations: false
          max_comments: 25
23  .github/workflows/tidy-review.yml  (vendored, new file)
@@ -0,0 +1,23 @@
name: clang-tidy-review

on:
  pull_request:
    branches:
      - master

jobs:
  clang-tidy-review:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - uses: ZedThree/clang-tidy-review@v0.13.0
        id: review
        with:
          lgtm_comment_body: ''
          build_dir: build
          cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
          split_workflow: true

      - uses: ZedThree/clang-tidy-review/upload@v0.13.0
25  .github/workflows/zig-build.yml  (vendored, new file)
@@ -0,0 +1,25 @@
name: Zig CI

on:
  pull_request:
  push:
    branches:
      - master

jobs:
  build:
    strategy:
      fail-fast: false
      matrix:
        runs-on: [ubuntu-latest, macos-latest, windows-latest]
    runs-on: ${{ matrix.runs-on }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: recursive
          fetch-depth: 0
      - uses: goto-bus-stop/setup-zig@v2
        with:
          version: 0.11.0
      - name: Build Summary
        run: zig build --summary all -freference-trace
179  .gitignore  (vendored)
@@ -1,145 +1,94 @@
-# Extensions
-
-*.a
-*.bat
-*.bin
-*.d
-*.dll
-*.dot
-*.etag
-*.exe
-*.gcda
-*.gcno
-*.gcov
-*.gguf
-*.gguf.json
-*.lastModified
-*.log
-*.metallib
 *.o
+*.a
 *.so
-*.swp
-*.tmp
-
-# IDE / OS
-
+*.gguf
+*.bin
+*.exe
+*.dll
+*.log
+*.gcov
+*.gcno
+*.gcda
+*.dot
+*.bat
+*.metallib
+.DS_Store
+.build/
 .cache/
 .ccls-cache/
 .direnv/
-.DS_Store
 .envrc
-.idea/
 .swiftpm
+.venv
+.clang-tidy
 .vs/
 .vscode/
-nppBackup
-
-# Coverage
-
-gcovr-report/
+.idea/
+
 lcov-report/
+gcovr-report/
 
-# Build Artifacts
-
-tags
-.build/
 build*
-!build-info.cmake
-!build-info.cpp.in
-!build-info.sh
-!build.zig
-!docs/build.md
-/libllama.so
-/llama-*
-/vulkan-shaders-gen
-android-ndk-*
-arm_neon.h
 cmake-build-*
-CMakeSettings.json
-compile_commands.json
-ggml-metal-embed.metal
-llama-batched-swift
-/rpc-server
 out/
 tmp/
-autogen-*.md
-
-# Deprecated
-
-/main
-/server
-
-# CI
-
-!.github/workflows/*.yml
-
-# Models
 
 models/*
 models-mnt
-!models/.editorconfig
-!models/ggml-vocab-*.gguf*
 
-# Zig
+/Pipfile
+/baby-llama
+/beam-search
+/benchmark-matmult
+/convert-llama2c-to-ggml
+/embd-input-test
+/embedding
+/gguf
+/gguf-llama-simple
+/imatrix
+/infill
+/libllama.so
+/llama-bench
+/llava-cli
+/lookahead
+/lookup
+/main
+/metal
+/passkey
+/perplexity
+/q8dot
+/quantize
+/quantize-stats
+/result
+/save-load-state
+/server
+/simple
+/batched
+/batched-bench
+/export-lora
+/finetune
+/speculative
+/parallel
+/train-text-from-scratch
+/tokenize
+/vdot
+/common/build-info.cpp
+arm_neon.h
+compile_commands.json
+CMakeSettings.json
+
+__pycache__
+dist
 
 zig-out/
 zig-cache/
 
-# Logs
-
 ppl-*.txt
 qnt-*.txt
 perf-*.txt
 
-# Examples
-
 examples/jeopardy/results.txt
-examples/server/*.css.hpp
-examples/server/*.html.hpp
-examples/server/*.js.hpp
-examples/server/*.mjs.hpp
-!build_64.sh
-!examples/*.bat
-!examples/*/*.kts
-!examples/*/*/*.kts
-!examples/sycl/*.bat
-!examples/sycl/*.sh
-
-# Server Web UI temporary files
-node_modules
-examples/server/webui/dist
-
-# Python
-
-/.venv
-__pycache__/
-*/poetry.lock
+poetry.lock
 poetry.toml
+nppBackup
-
-# Nix
-/result
-
-# Test binaries
-/tests/test-backend-ops
-/tests/test-double-float
-/tests/test-grad0
-/tests/test-grammar-parser
-/tests/test-llama-grammar
-/tests/test-opt
-/tests/test-quantize-fns
-/tests/test-quantize-perf
-/tests/test-rope
-/tests/test-sampling
-/tests/test-tokenizer-0
-/tests/test-tokenizer-1-bpe
-/tests/test-tokenizer-1-spm
-
-# Scripts
-!/scripts/install-oneapi.bat
-
-# Test models for lora adapters
-/lora-tests
-
-# Local scripts
-/run-vim.sh
-/run-chat.sh
2 .gitmodules vendored
@@ -1,3 +1,3 @@
 [submodule "kompute"]
-	path = ggml/src/ggml-kompute/kompute
+	path = kompute
 	url = https://github.com/nomic-ai/kompute.git
.pre-commit-config.yaml
@@ -3,14 +3,13 @@
 exclude: prompts/.*.txt
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.6.0
+  rev: v3.2.0
   hooks:
   - id: trailing-whitespace
   - id: end-of-file-fixer
   - id: check-yaml
   - id: check-added-large-files
 - repo: https://github.com/PyCQA/flake8
-  rev: 7.0.0
+  rev: 6.0.0
   hooks:
   - id: flake8
-    additional_dependencies: [flake8-no-print]
1244 CMakeLists.txt
File diff suppressed because it is too large
CMakePresets.json
@@ -1,97 +0,0 @@
{
  "version": 4,
  "configurePresets": [
    {
      "name": "base",
      "hidden": true,
      "generator": "Ninja",
      "binaryDir": "${sourceDir}/build-${presetName}",
      "cacheVariables": {
        "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
        "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
      }
    },
    {
      "name": "sycl-base",
      "hidden": true,
      "generator": "Ninja",
      "binaryDir": "${sourceDir}/build-${presetName}",
      "cacheVariables": {
        "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
        "CMAKE_CXX_COMPILER": "icx",
        "CMAKE_C_COMPILER": "cl",
        "GGML_SYCL": "ON",
        "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
      }
    },
    { "name": "debug",    "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
    { "name": "release",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
    { "name": "reldbg",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
    { "name": "static",   "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
    { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
    { "name": "vulkan",   "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },

    {
      "name": "x64-windows-llvm", "hidden": true,
      "cacheVariables": {
        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
      }
    },

    {
      "name": "arm64-windows-msvc", "hidden": true,
      "architecture": { "value": "arm64", "strategy": "external" },
      "toolset": { "value": "host=x64", "strategy": "external" },
      "cacheVariables": {
        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
      }
    },

    {
      "name": "arm64-windows-llvm", "hidden": true,
      "architecture": { "value": "arm64", "strategy": "external" },
      "toolset": { "value": "host=x64", "strategy": "external" },
      "cacheVariables": {
        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
      }
    },

    {
      "name": "arm64-apple-clang", "hidden": true,
      "architecture": { "value": "arm64", "strategy": "external" },
      "toolset": { "value": "host=x64", "strategy": "external" },
      "cacheVariables": {
        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
      }
    },

    { "name": "arm64-windows-llvm-debug",          "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
    { "name": "arm64-windows-llvm-release",        "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },

    { "name": "arm64-apple-clang-debug",           "inherits": [ "base", "arm64-apple-clang", "debug" ] },
    { "name": "arm64-apple-clang-release",         "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
    { "name": "arm64-apple-clang+static-release",  "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },

    { "name": "arm64-windows-msvc-debug",          "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
    { "name": "arm64-windows-msvc-release",        "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },

    { "name": "x64-windows-llvm-debug",            "inherits": [ "base", "x64-windows-llvm", "debug" ] },
    { "name": "x64-windows-llvm-release",          "inherits": [ "base", "x64-windows-llvm", "release" ] },
    { "name": "x64-windows-llvm-reldbg",           "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
    { "name": "x64-windows-llvm+static-release",   "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },

    { "name": "x64-windows-msvc-debug",            "inherits": [ "base", "debug" ] },
    { "name": "x64-windows-msvc-release",          "inherits": [ "base", "reldbg" ] },
    { "name": "x64-windows-msvc+static-release",   "inherits": [ "base", "reldbg", "static" ] },

    { "name": "x64-windows-sycl-debug",            "inherits": [ "sycl-base", "debug" ] },
    { "name": "x64-windows-sycl-debug-f16",        "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
    { "name": "x64-windows-sycl-release",          "inherits": [ "sycl-base", "release" ] },
    { "name": "x64-windows-sycl-release-f16",      "inherits": [ "sycl-base", "release", "sycl_f16" ] },

    { "name": "x64-windows-vulkan-debug",          "inherits": [ "base", "vulkan", "debug" ] },
    { "name": "x64-windows-vulkan-release",        "inherits": [ "base", "vulkan", "release" ] }
  ]
}
11 CODEOWNERS
@@ -1,11 +0,0 @@
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs

/ci/ @ggerganov
/.devops/*.Dockerfile @ngxson
/examples/server/ @ngxson
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler
125 CONTRIBUTING.md
@@ -1,125 +0,0 @@
# Pull requests (for contributors)

- Test your changes:
  - Execute [the full CI locally on your machine](ci/README.md) before publishing
  - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
  - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
  - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments

# Pull requests (for collaborators)

- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)

# Coding guidelines

- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Use sized integer types such as `int32_t` in the public API, e.g. `size_t` may also be appropriate for allocation sizes or byte offsets
- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo`
  - In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary
    ```cpp
    // OK
    llama_context * ctx;
    const llama_rope_type rope_type;

    // not OK
    struct llama_context * ctx;
    const enum llama_rope_type rope_type;
    ```
    _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T$ (a quick dimension check is sketched below)
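As a quick sanity check of this convention (a sketch with generic shapes, not tied to any particular `ggml` tensor):

$$
A \in \mathbb{R}^{p \times k},\; B \in \mathbb{R}^{q \times k} \;\Longrightarrow\; C = B A^{\mathsf{T}} \in \mathbb{R}^{q \times p}, \qquad C_{ij} = \sum_{r=1}^{k} B_{ir} A_{jr}
$$

Both operands share the inner dimension $k$ along dimension 0 (columns), and the result has as many columns as $A$ has rows.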
# Naming guidelines

- Use `snake_case` for function, variable and type names
- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)

  ```cpp
  // not OK
  int small_number;
  int big_number;

  // OK
  int number_small;
  int number_big;
  ```

- Enum values are always in upper case and prefixed with the enum name

  ```cpp
  enum llama_vocab_type {
      LLAMA_VOCAB_TYPE_NONE = 0,
      LLAMA_VOCAB_TYPE_SPM  = 1,
      LLAMA_VOCAB_TYPE_BPE  = 2,
      LLAMA_VOCAB_TYPE_WPM  = 3,
      LLAMA_VOCAB_TYPE_UGM  = 4,
      LLAMA_VOCAB_TYPE_RWKV = 5,
  };
  ```

- The general naming pattern is `<class>_<method>`, with `<method>` being `<action>_<noun>`

  ```cpp
  llama_model_init();           // class: "llama_model",         method: "init"
  llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove"
  llama_sampler_get_seed();     // class: "llama_sampler",       method: "get_seed"
  llama_set_embeddings();       // class: "llama_context",       method: "set_embeddings"
  llama_n_threads();            // class: "llama_context",       method: "n_threads"
  llama_adapter_lora_free();    // class: "llama_adapter_lora",  method: "free"
  ```

  - The `get` `<action>` can be omitted
  - The `<noun>` can be omitted if not necessary
  - The `_context` suffix of the `<class>` is optional. Use it to disambiguate symbols when needed
  - Use `init`/`free` for constructor/destructor `<action>`

- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else

  ```cpp
  typedef struct llama_context * llama_context_t;

  enum llama_pooling_type llama_pooling_type(const llama_context_t ctx);
  ```

  _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_

- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension
- Python filenames are all lowercase with underscores

- _(TODO: abbreviations usage)_

# Preprocessor directives

- _(TODO: add guidelines with examples and apply them to the codebase)_

  ```cpp
  #ifdef FOO
  #endif // FOO
  ```

# Documentation

- Documentation is a community effort
- When you need to look into the source code to figure out how to use an API consider adding a short summary to the header file for future reference
- When you notice incorrect or outdated documentation, please update it

# Resources

The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:

https://github.com/ggerganov/llama.cpp/projects
2 LICENSE
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023-2024 The ggml authors
+Copyright (c) 2023 Georgi Gerganov

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
Package.swift
@@ -14,6 +14,47 @@ let package = Package(
         .library(name: "llama", targets: ["llama"]),
     ],
     targets: [
-        .systemLibrary(name: "llama", pkgConfig: "llama"),
-    ]
+        .target(
+            name: "llama",
+            path: ".",
+            exclude: [
+                "cmake",
+                "examples",
+                "scripts",
+                "models",
+                "tests",
+                "CMakeLists.txt",
+                "ggml-cuda.cu",
+                "ggml-cuda.h",
+                "Makefile"
+            ],
+            sources: [
+                "ggml.c",
+                "llama.cpp",
+                "ggml-alloc.c",
+                "ggml-backend.c",
+                "ggml-quants.c",
+                "ggml-metal.m",
+            ],
+            resources: [
+                .process("ggml-metal.metal")
+            ],
+            publicHeadersPath: "spm-headers",
+            cSettings: [
+                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
+                .define("GGML_USE_ACCELERATE"),
+                .unsafeFlags(["-fno-objc-arc"]),
+                .define("GGML_USE_METAL"),
+                // NOTE: NEW_LAPACK will required iOS version 16.4+
+                // We should consider add this in the future when we drop support for iOS 14
+                // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+                // .define("ACCELERATE_NEW_LAPACK"),
+                // .define("ACCELERATE_LAPACK_ILP64")
+            ],
+            linkerSettings: [
+                .linkedFramework("Accelerate")
+            ]
+        )
+    ],
+    cxxLanguageStandard: .cxx11
 )
494 README-sycl.md Normal file
@@ -0,0 +1,494 @@
# llama.cpp for SYCL

- [Background](#background)
- [OS](#os)
- [Intel GPU](#intel-gpu)
- [Docker](#docker)
- [Linux](#linux)
- [Windows](#windows)
- [Environment Variable](#environment-variable)
- [Known Issue](#known-issue)
- [Q&A](#q&a)
- [Todo](#todo)

## Background

SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.

oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.

Intel uses SYCL as the direct programming language to support CPUs, GPUs and FPGAs.

To avoid re-inventing the wheel, this code follows the other code paths in llama.cpp (such as OpenBLAS, cuBLAS, CLBlast). The open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used to migrate the code to SYCL.

llama.cpp for SYCL is used to support Intel GPUs.

For Intel CPUs, it is recommended to use llama.cpp for x86 (Intel MKL build).

## OS

|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
|Windows|Support|Windows 11|

## Intel GPU

### Verified

|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770, 730M|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|

Note: if the iGPU has fewer than 80 EUs (Execution Units), inference will be too slow to be usable.

### Memory

Memory is the main limitation when running LLMs on GPUs.

When llama.cpp runs, it prints a log showing how much GPU memory is allocated in your case, e.g. `llm_load_tensors: buffer size = 3577.56 MiB`.

For an iGPU, make sure there is enough shared host memory; for llama-2-7b.Q4_0, 8 GB+ of host memory is recommended.

For a dGPU, make sure there is enough device memory; for llama-2-7b.Q4_0, 4 GB+ of device memory is recommended.

## Docker

Note:
- Only docker on Linux is tested. Docker on WSL may not work.
- You may need to install the Intel GPU driver on the host machine (see the [Linux](#linux) section for how to do that).

### Build the image

You can choose between an **F16** and an **F32** build. F16 is faster for long-prompt inference.

```sh
# For F16:
#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .

# Or, for F32:
docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .

# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
```

### Run

```sh
# Firstly, find all the DRI cards:
ls -la /dev/dri
# Then, pick the card that you want to use.

# For example with "/dev/dri/card1"
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```

## Linux

### Setup Environment

1. Install the Intel GPU driver.

a. Install the Intel GPU driver following the official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).

Note: for an iGPU, install the client GPU driver.

b. Add your user to the video and render groups.

```sh
sudo usermod -aG render username
sudo usermod -aG video username
```

Note: re-login to apply the change.

c. Check

```sh
sudo apt install clinfo
sudo clinfo -l
```

Output (example):

```
Platform #0: Intel(R) OpenCL Graphics
 `-- Device #0: Intel(R) Arc(TM) A770 Graphics

Platform #0: Intel(R) OpenCL HD Graphics
 `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```

2. Install the Intel® oneAPI Base Toolkit.

a. Follow the procedure in [Get the Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

It is recommended to install to the default folder: **/opt/intel/oneapi**.

The following guide uses the default folder as an example. If you installed to another folder, adjust the paths accordingly.

b. Check

```sh
source /opt/intel/oneapi/setvars.sh

sycl-ls
```

There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.

Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```

3. Build locally:

Note:
- You can choose between an **F16** and an **F32** build. F16 is faster for long-prompt inference.
- By default, all binaries are built, which takes more time. To reduce the build time, build **example/main** only.

```sh
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh

# For FP16:
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON

# Or, for FP32:
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

# Build example/main only
#cmake --build . --config Release --target main

# Or, build all binaries
cmake --build . --config Release -v

cd ..
```

or

```sh
./examples/sycl/build.sh
```

### Run

1. Put the model file in the **models** folder

You can download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as an example.

2. Enable the oneAPI running environment

```
source /opt/intel/oneapi/setvars.sh
```

3. List the device IDs

Run without parameters:

```sh
./build/bin/ls-sycl-device

# or run the "main" executable and look at the output log:

./build/bin/main
```

Check the IDs in the startup log, like:

```
found 4 SYCL devices:
  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
    max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
    max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```

|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero running time, recommended|
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|

4. Set the device ID and execute llama.cpp

Set device ID = 0 with **GGML_SYCL_DEVICE=0**:

```sh
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
or run by script:

```sh
./examples/sycl/run_llama2.sh
```

Note:

- By default, mmap is used to read the model file. In some cases this leads to a hang; use the **--no-mmap** parameter to disable mmap() and avoid the issue.

5. Check the device ID in the output

Like:
```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```

## Windows

### Setup Environment

1. Install the Intel GPU driver.

Install the Intel GPU driver following the official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).

Note: **The driver is mandatory for compute function**.

2. Install Visual Studio.

Install [Visual Studio](https://visualstudio.microsoft.com/), which is required to enable the oneAPI environment on Windows.

3. Install the Intel® oneAPI Base Toolkit.

a. Follow the procedure in [Get the Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

It is recommended to install to the default folder: **C:\Program Files (x86)\Intel\oneAPI**.

The following guide uses the default folder as an example. If you installed to another folder, adjust the paths accordingly.

b. Enable the oneAPI running environment:

- In Search, input 'oneAPI'.

Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"

- In Run:

In CMD:
```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```

c. Check the GPU

In the oneAPI command line:

```
sycl-ls
```

There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.

Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
```

4. Install cmake & make

a. Download & install cmake for Windows: https://cmake.org/download/

b. Download & install mingw-w64 make for Windows provided by w64devkit

- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).

- Extract `w64devkit` on your pc.

- Add the **bin** folder path to the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`.

### Build locally:

In the oneAPI command line window:

```
mkdir -p build
cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

:: for FP16
:: faster for long-prompt inference
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

:: for FP32
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release

:: build example/main only
:: make main

:: build all binaries
make -j
cd ..
```

or

```
.\examples\sycl\win-build-sycl.bat
```

Note:

- By default, all binaries are built, which takes more time. To reduce the build time, build **example/main** only.

### Run

1. Put the model file in the **models** folder

You can download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as an example.

2. Enable the oneAPI running environment

- In Search, input 'oneAPI'.

Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"

- In Run:

In CMD:
```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```

3. List the device IDs

Run without parameters:

```
build\bin\ls-sycl-device.exe

or

build\bin\main.exe
```

Check the IDs in the startup log, like:

```
found 4 SYCL devices:
  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
    max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
    max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```

|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero running time, recommended|
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|

4. Set the device ID and execute llama.cpp

Set device ID = 0 with **set GGML_SYCL_DEVICE=0**:

```
set GGML_SYCL_DEVICE=0
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
```
or run by script:

```
.\examples\sycl\win-run-llama2.bat
```

Note:

- By default, mmap is used to read the model file. In some cases this leads to a hang; use the **--no-mmap** parameter to disable mmap() and avoid the issue.

5. Check the device ID in the output

Like:
```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```

## Environment Variable

#### Build

|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, do not set it.|
|CMAKE_C_COMPILER|icx|Use the icx compiler for the SYCL code path|
|CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|Use icpx/icx for the SYCL code path|

#### Running

|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids in the default running output|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable the log function guarded by the GGML_SYCL_DEBUG macro|
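The same run configuration can also be driven programmatically; a minimal sketch in Python (assuming the `build/bin/main` binary and the model path used in the steps above):

```python
import os
import subprocess

# Select SYCL device 0 (see the table above) and run the main example,
# mirroring the shell command from step 4.
env = dict(os.environ, GGML_SYCL_DEVICE="0", GGML_SYCL_DEBUG="0")
subprocess.run(
    ["./build/bin/main",
     "-m", "models/llama-2-7b.Q4_0.gguf",
     "-p", "Building a website can be done in 10 simple steps:",
     "-n", "400", "-e", "-ngl", "33", "--no-mmap"],
    env=env,
    check=True,
)
```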
## Known Issue

- Hang during startup

  llama.cpp uses mmap by default to read the model file and copy it to the GPU. On some systems the memcpy misbehaves and blocks.

  Solution: add **--no-mmap** or **--mmap 0**.

## Q&A

- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.

  The oneAPI running environment was not enabled.

  Install the oneAPI Base Toolkit and enable it with `source /opt/intel/oneapi/setvars.sh`.

- On Windows, no result and no error.

  The oneAPI running environment was not enabled.

- Compile error.

  Remove the **build** folder and try again.

- I can **not** see **[ext_oneapi_level_zero:gpu:0]** after installing the GPU driver on Linux.

  Please run **sudo sycl-ls**.

  If you see it in the result, add the video/render groups to your user:

  ```
  sudo usermod -aG render username
  sudo usermod -aG video username
  ```

  Then **relogin**.

  If you do not see it, please check the GPU installation steps again.

## Todo

- Support multiple cards.
67 SECURITY.md
@@ -1,67 +0,0 @@
# Security Policy

 - [**Using llama.cpp securely**](#using-llamacpp-securely)
   - [Untrusted models](#untrusted-models)
   - [Untrusted inputs](#untrusted-inputs)
   - [Data privacy](#data-privacy)
   - [Untrusted environments or networks](#untrusted-environments-or-networks)
   - [Multi-Tenant environments](#multi-tenant-environments)
 - [**Reporting a vulnerability**](#reporting-a-vulnerability)

## Using llama.cpp securely

### Untrusted models
Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.

*Always execute untrusted models within a secure, isolated environment such as a sandbox* (e.g., containers, virtual machines). This helps protect your system from potentially malicious code.

> [!NOTE]
> The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.

### Untrusted inputs

Some models accept various input formats (text, images, audio, etc.). The libraries converting these inputs have varying security levels, so it's crucial to isolate the model and carefully pre-process inputs to mitigate script injection risks.

For maximum security when handling untrusted inputs, you may need to employ the following:

* Sandboxing: Isolate the environment where the inference happens.
* Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics.
* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches.
* Input Sanitation: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as:
  * Validation: Enforce strict rules on allowed characters and data types.
  * Filtering: Remove potentially malicious scripts or code fragments.
  * Encoding: Convert special characters into safe representations.
  * Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)).

### Data privacy

To protect sensitive data from potential leaks or unauthorized access, it is crucial to sandbox the model execution. This means running the model in a secure, isolated environment, which helps mitigate many attack vectors.

### Untrusted environments or networks

If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
* Encrypt your data if sending it over the network.

### Multi-Tenant environments

If you intend to run multiple models in parallel with shared memory, it is your responsibility to ensure the models do not interact or access each other's data. The primary areas of concern are tenant isolation, resource allocation, model sharing and hardware attacks.

1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.

2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.

3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.

4. Hardware Attacks: GPUs or TPUs can also be attacked. [Research](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.

## Reporting a vulnerability

Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.

<!-- normal version -->
However, if you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.

Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).

A team of volunteers maintains this project on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure.
@@ -1,4 +0,0 @@
#pragma once

#include <llama.h>

@@ -1,5 +0,0 @@
module llama [system] {
    header "llama.h"
    link "llama"
    export *
}
116 awq-py/README.md Normal file
@@ -0,0 +1,116 @@
# AWQ: Activation-aware Weight Quantization for LLM - version applied to llama.cpp
[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]

**Supported models:**

- [X] LLaMA
- [x] LLaMA 2
- [X] MPT
- [X] Mistral AI v0.1
- [ ] Bloom
- [ ] Mixtral MoE

**TODO:**
- [x] Update version to work with both MPT and MPT-AWQ models
- [ ] Add OPT model
- [ ] Add Bloom model
- [ ] Add Mixtral MoE
- [ ] Support w3, w2

## Contents

- [Install](##Install)
- [Convert](##Convert)
- [Quantize](##Quantize)
- [Test](##Test)
- [Benchmark](##Benchmark)
- [Results](##Results)

## Install
Install requirements
```bash
pip install -r requirements.txt
```
Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT
```bash
git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
```

## Convert
Example for a llama model
```bash
# For llama7b and llama2 models
python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
# For mistral and mpt models
python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
```

## Quantize
```bash
# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types.
./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
```

## Test
```bash
# For all models.
./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time"
```

## Benchmark
The perplexity measurements in the tables below are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with a context length of 512.
```bash
# For llama, llama2, and mistral models.
./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
```
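For reference, the perplexity values in the tables below follow the usual token-level definition over the evaluation set (lower is better):

$$
\mathrm{PPL} = \exp\left(-\frac{1}{N}\sum_{i=1}^{N} \log p_\theta\left(x_i \mid x_{<i}\right)\right)
$$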
## Results
Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison.
We use three of the llama.cpp quantization types to work with our version: q4_0, q4_1, and q2_k.

### Llama 7B (Build with OpenBLAS)

| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|-----------:|--------------|-------:|-------:|-------:|-------:|
|Llama 7B | perplexity | 5.9066 | 6.1214 | 6.0643 | 6.5808 |
|Llama 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G |
|Llama 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-LLama 7B| perplexity | 5.9175 | 6.0252 | 5.9987 | 6.3692 |
|AWQ-LLama 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G |
|AWQ-LLama 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |

### Llama2 7B (Build with CuBLAS)

| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|------------:|--------------|-------:|-------:|-------:|-------:|
|Llama2 7B | perplexity | 5.8664 | 6.0260 | 6.0656 | 6.4496 |
|Llama2 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G |
|Llama2 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-LLama2 7B| perplexity | 5.8801 | 6.0054 | 5.9849 | 6.3650 |
|AWQ-LLama2 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G |
|AWQ-LLama2 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |

### Mistral 7B v0.1 (Build with CuBLAS)

| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|-------------:|--------------|-------:|-------:|-------:|-------:|
|Mistral 7B | perplexity | 5.6931 | 5.8202 | 5.8268 | 6.1645 |
|Mistral 7B | file size | 14.5G | 4.1G | 4.5G | 3.1G |
|Mistral 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-Mistral 7B| perplexity | 5.6934 | 5.8020 | 5.7691 | 6.0426 |
|AWQ-Mistral 7B| file size | 14.5G | 4.1G | 4.5G | 3.1G |
|AWQ-Mistral 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |

### MPT 7B (Build with OpenBLAS)

| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|---------:|--------------|-------:|-------:|-------:|--------:|
|MPT 7B | perplexity | 8.4369 | 8.7956 | 8.6265 | 11.4913 |
|MPT 7B | file size | 13.7G | 3.9G | 4.3G | 2.8G |
|MPT 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-MPT 7B| perplexity | 8.4944 | 8.7053 | 8.6750 | 10.2873 |
|AWQ-MPT 7B| file size | 13.7G | 3.9G | 4.3G | 2.8G |
|AWQ-MPT 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
254
awq-py/awq/apply_awq.py
Normal file
254
awq-py/awq/apply_awq.py
Normal file
|
@ -0,0 +1,254 @@
|
||||||
|
"""
|
||||||
|
Implements the AWQ for llama.cpp use cases.
|
||||||
|
Original paper: https://arxiv.org/abs/2306.00978
|
||||||
|
|
||||||
|
This code is based on versions of the AWQ implementation found in the following repositories:
|
||||||
|
* https://github.com/mit-han-lab/llm-awq
|
||||||
|
* https://github.com/casper-hansen/AutoAWQ
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
|
||||||
|
from transformers import AutoModelForCausalLM, AutoConfig
|
||||||
|
from transformers.models.bloom.modeling_bloom import BloomGelu
|
||||||
|
from transformers.models.llama.modeling_llama import LlamaRMSNorm
|
||||||
|
from transformers.activations import GELUActivation
|
||||||
|
|
||||||
|
|
||||||
|
class ScaledActivation(nn.Module):
|
||||||
|
"""
|
||||||
|
ScaledActivation module wraps an existing activation function and applies a
|
||||||
|
scale factor to its output.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
module (nn.Module): The activation function to be scaled.
|
||||||
|
scales (torch.Tensor): A tensor of size (num_features,) containing the initial
|
||||||
|
scale factors for each feature.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
torch.Tensor: The scaled output of the activation function.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, module, scales):
|
||||||
|
super().__init__()
|
||||||
|
self.act = module
|
||||||
|
self.scales = nn.Parameter(scales.data)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
|
||||||
|
|
||||||
|
|
||||||
|
def set_op_by_name(layer, name, new_module):
|
||||||
|
"""
|
||||||
|
Set the new module for given module's name.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
layer (nn.Module): The layer in which to replace the submodule.
|
||||||
|
name (str): The path to the submodule to be replaced, using dot notation
|
||||||
|
to access nested modules.
|
||||||
|
new_module (nn.Module): The new module to replace the existing one.
|
||||||
|
"""
|
||||||
|
levels = name.split(".")
|
||||||
|
if len(levels) > 1:
|
||||||
|
mod_ = layer
|
||||||
|
for l_idx in range(len(levels) - 1):
|
||||||
|
if levels[l_idx].isdigit():
|
||||||
|
mod_ = mod_[int(levels[l_idx])]
|
||||||
|
else:
|
||||||
|
mod_ = getattr(mod_, levels[l_idx])
|
||||||
|
setattr(mod_, levels[-1], new_module)
|
||||||
|
else:
|
||||||
|
setattr(layer, name, new_module)
|
||||||
|
|
||||||
|
|
||||||
|
def get_op_by_name(module, op_name):
|
||||||
|
"""
|
||||||
|
Retrieves a submodule within a given layer based on its name.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
module (nn.Module): The layer containing the submodule to find.
|
||||||
|
op_name (str): The name of the submodule.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
nn.Module: The requested submodule found within the given layer.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the specified submodule cannot be found within the layer.
|
||||||
|
"""
|
||||||
|
for name, m in module.named_modules():
|
||||||
|
if name == op_name:
|
||||||
|
return m
|
||||||
|
raise ValueError(f"Cannot find op {op_name} in module {module}")
|
||||||
|
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def scale_ln_fcs(ln, fcs, scales):
|
||||||
|
"""
|
||||||
|
Scales the weights of a LayerNorm and a list of fully-connected layers proportionally.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ln (nn.LayerNorm): The LayerNorm module to be scaled.
|
||||||
|
fcs (List[nn.Linear]): A list of fully-connected layers to be scaled.
|
||||||
|
scales (torch.Tensor): A 1D tensor of size (num_features,).
|
||||||
|
"""
|
||||||
|
|
||||||
|
if not isinstance(fcs, list):
|
||||||
|
fcs = [fcs]
|
||||||
|
|
||||||
|
scales = scales.to(ln.weight.device)
|
||||||
|
|
||||||
|
ln.weight.div_(scales)
|
||||||
|
if hasattr(ln, "bias") and ln.bias is not None:
|
||||||
|
ln.bias.div_(scales)
|
||||||
|
|
||||||
|
for fc in fcs:
|
||||||
|
fc.weight.mul_(scales.view(1, -1))
|
||||||
|
|
||||||
|
for p in ln.parameters():
|
||||||
|
assert torch.isnan(p).sum() == 0
|
||||||
|
for fc in fcs:
|
||||||
|
for p in fc.parameters():
|
||||||
|
assert torch.isnan(p).sum() == 0
|
||||||
|
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def scale_fc_fc(fc1, fc2, scales):
|
||||||
|
"""
|
||||||
|
Scales the weights of two fully-connected layers in a specific pattern.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
fc1 (nn.Linear): The first fully-connected layer to be scaled.
|
||||||
|
fc2 (nn.Linear): The second fully-connected layer to be scaled.
|
||||||
|
scales (torch.Tensor): A 1D tensor of size (num_features,).
|
||||||
|
"""
|
||||||
|
assert isinstance(fc1, nn.Linear)
|
||||||
|
assert isinstance(fc2, nn.Linear)
|
||||||
|
|
||||||
|
scales = scales.to(fc1.weight.device)
|
||||||
|
|
||||||
|
fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
|
||||||
|
if fc1.bias is not None:
|
||||||
|
fc1.bias.div_(scales.view(-1))
|
||||||
|
|
||||||
|
fc2.weight.mul_(scales.view(1, -1))
|
||||||
|
|
||||||
|
for p in fc1.parameters():
|
||||||
|
assert torch.isnan(p).sum() == 0
|
||||||
|
for p in fc2.parameters():
|
||||||
|
assert torch.isnan(p).sum() == 0
|
||||||
|
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def scale_gelu_fc(gelu, fc, scales):
|
||||||
|
"""
|
||||||
|
Scales the weight of a GELU activation and a fully-connected layer proportionally.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled.
|
||||||
|
fc (nn.Linear): The fully-connected layer to be scaled.
|
||||||
|
scales (torch.Tensor): A 1D tensor of size (num_features,).
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`.
|
||||||
|
TypeError: If the `fc` module is not of type `nn.Linear`.
|
||||||
|
"""
|
||||||
|
assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
|
||||||
|
assert isinstance(fc, nn.Linear)
|
||||||
|
|
||||||
|
fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))
|
||||||
|
|
||||||
|
for p in fc.parameters():
|
||||||
|
assert torch.isnan(p).sum() == 0
|
||||||
|
|
||||||
|
|
||||||
|
def apply_scale(module, scales_list, input_feat_dict=None):
|
||||||
|
"""
|
||||||
|
Applies different scaling strategies to layers based on their type and hierarchy within a given module.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
module (nn.Module): The module containing the layers to be scaled.
|
||||||
|
scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing:
|
||||||
|
* prev_op_name (str): The name of the preceding operation or module,
|
||||||
|
relative to which the layers to be scaled are located.
|
||||||
|
* layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation.
|
||||||
|
* scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
|
||||||
|
input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding
|
||||||
|
input features (optional).
|
||||||
|
"""
|
||||||
|
for prev_op_name, layer_names, scales in scales_list:
|
||||||
|
prev_op = get_op_by_name(module, prev_op_name)
|
||||||
|
layers = [get_op_by_name(module, name) for name in layer_names]
|
||||||
|
|
||||||
|
prev_op.cuda()
|
||||||
|
for layer in layers:
|
||||||
|
layer.cuda()
|
||||||
|
scales.cuda()
|
||||||
|
|
||||||
|
if isinstance(prev_op, nn.Linear):
|
||||||
|
assert len(layers) == 1
|
||||||
|
scale_fc_fc(prev_op, layers[0], scales)
|
||||||
|
elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower():
|
||||||
|
scale_ln_fcs(prev_op, layers, scales)
|
||||||
|
elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
|
||||||
|
new_module = ScaledActivation(prev_op, scales)
|
||||||
|
set_op_by_name(module, prev_op_name, new_module)
|
||||||
|
scale_gelu_fc(prev_op, layers[0], scales)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")
|
||||||
|
|
||||||
|
# apply the scaling to input feat if given; prepare it for clipping
|
||||||
|
if input_feat_dict is not None:
|
||||||
|
for layer_name in layer_names:
|
||||||
|
inp = input_feat_dict[layer_name]
|
||||||
|
inp.div_(scales.view(1, -1).to(inp.device))
|
||||||
|
|
||||||
|
prev_op.cpu()
|
||||||
|
for layer in layers:
|
||||||
|
layer.cpu()
|
||||||
|
scales.cpu()
|
||||||
|
|
||||||
|
|
||||||
|
@torch.no_grad()
def apply_clip(module, clip_list):
    """
    Applies element-wise clipping to the weight of a specific layer within a given module.

    Args:
        module (nn.Module): The module containing the layer to be clipped.
        clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing:
            * name (str): The name of the layer to be clipped, relative to the root of the module.
            * max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight.
    """
    for name, max_val in clip_list:
        layer = get_op_by_name(module, name)
        layer.cuda()
        max_val = max_val.to(layer.weight.device)
        org_shape = layer.weight.shape
        layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
        layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
        layer.weight.data = layer.weight.data.reshape(org_shape)
        layer.cpu()

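A corresponding sketch for apply_clip (not part of the diff; same assumptions as above, including an available CUDA device): max_val is broadcast against the reshaped weight, so here a tensor of shape (out_features, 1, 1) clamps each output row to its own bound.

    import torch
    from torch import nn

    layer   = nn.Linear(16, 4)
    max_val = layer.weight.abs().amax(dim=1, keepdim=True).unsqueeze(-1) * 0.5  # (4, 1, 1)

    apply_clip(nn.ModuleDict({"fc": layer}), [("fc", max_val)])
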
def add_scale_weights(model_path, scale_path, tmp_path):
    """
    Adds pre-computed Activation-aware Weight Quantization (AWQ) results to a model,
    including scaling factors and clipping bounds.

    Args:
        model_path (str): Path to the pre-trained model to be equipped with AWQ.
        scale_path (str): Path to the AWQ scale factors (.pt file).
        tmp_path (str): Path to the temporary directory where the equipped model will be saved.
    """
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path, config=config, trust_remote_code=True
    )
    model.eval()
    awq_results = torch.load(str(scale_path), map_location="cpu")
    apply_scale(model, awq_results["scale"])
    apply_clip(model, awq_results["clip"])
    model.save_pretrained(str(tmp_path))
    os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")
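A usage sketch for add_scale_weights (not part of the diff; the paths below are hypothetical): the .pt file is expected to hold a dict with "scale" and "clip" entries, matching what torch.load reads above.

    add_scale_weights(
        "models/llama-7b-hf",        # hypothetical HF model directory
        "awq_cache/llama-7b-w4.pt",  # hypothetical pre-computed AWQ results
        "models/llama-7b-awq",       # output directory for the scaled model
    )
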
awq-py/requirements.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
torch>=2.1.1
transformers>=4.32.0

build.zig (new file, 139 lines)
@@ -0,0 +1,139 @@
// Compatible with Zig Version 0.11.0
const std = @import("std");
const ArrayList = std.ArrayList;
const Compile = std.Build.Step.Compile;
const ConfigHeader = std.Build.Step.ConfigHeader;
const Mode = std.builtin.Mode;
const CrossTarget = std.zig.CrossTarget;

const Maker = struct {
    builder: *std.build.Builder,
    target: CrossTarget,
    optimize: Mode,
    enable_lto: bool,

    include_dirs: ArrayList([]const u8),
    cflags: ArrayList([]const u8),
    cxxflags: ArrayList([]const u8),
    objs: ArrayList(*Compile),

    fn addInclude(m: *Maker, dir: []const u8) !void {
        try m.include_dirs.append(dir);
    }
    fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
    }
    fn addCFlag(m: *Maker, flag: []const u8) !void {
        try m.cflags.append(flag);
    }
    fn addCxxFlag(m: *Maker, flag: []const u8) !void {
        try m.cxxflags.append(flag);
    }
    fn addFlag(m: *Maker, flag: []const u8) !void {
        try m.addCFlag(flag);
        try m.addCxxFlag(flag);
    }

    fn init(builder: *std.build.Builder) !Maker {
        const target = builder.standardTargetOptions(.{});
        const zig_version = @import("builtin").zig_version_string;
        const commit_hash = try std.ChildProcess.exec(
            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
        );
        try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
            \\int LLAMA_BUILD_NUMBER = {};
            \\char const *LLAMA_COMMIT = "{s}";
            \\char const *LLAMA_COMPILER = "Zig {s}";
            \\char const *LLAMA_BUILD_TARGET = "{s}";
            \\
        , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
        var m = Maker{
            .builder = builder,
            .target = target,
            .optimize = builder.standardOptimizeOption(.{}),
            .enable_lto = false,
            .include_dirs = ArrayList([]const u8).init(builder.allocator),
            .cflags = ArrayList([]const u8).init(builder.allocator),
            .cxxflags = ArrayList([]const u8).init(builder.allocator),
            .objs = ArrayList(*Compile).init(builder.allocator),
        };

        try m.addCFlag("-std=c11");
        try m.addCxxFlag("-std=c++11");
        try m.addProjectInclude(&.{});
        try m.addProjectInclude(&.{"common"});
        return m;
    }

    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
        if (o.target.getAbi() != .msvc)
            o.defineCMacro("_GNU_SOURCE", null);

        if (std.mem.endsWith(u8, src, ".c")) {
            o.addCSourceFiles(&.{src}, m.cflags.items);
            o.linkLibC();
        } else {
            o.addCSourceFiles(&.{src}, m.cxxflags.items);
            if (o.target.getAbi() == .msvc) {
                o.linkLibC(); // need winsdk + crt
            } else {
                // linkLibCpp already add (libc++ + libunwind + libc)
                o.linkLibCpp();
            }
        }
        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
        o.want_lto = m.enable_lto;
        return o;
    }

    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
        e.addCSourceFiles(&.{src}, m.cxxflags.items);
        for (deps) |d| e.addObject(d);
        for (m.objs.items) |o| e.addObject(o);
        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });

        // https://github.com/ziglang/zig/issues/15448
        if (e.target.getAbi() == .msvc) {
            e.linkLibC(); // need winsdk + crt
        } else {
            // linkLibCpp already add (libc++ + libunwind + libc)
            e.linkLibCpp();
        }
        m.builder.installArtifact(e);
        e.want_lto = m.enable_lto;
        return e;
    }
};

pub fn build(b: *std.build.Builder) !void {
    var make = try Maker.init(b);
    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;

    const ggml = make.obj("ggml", "ggml.c");
    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
    const llama = make.obj("llama", "llama.cpp");
    const buildinfo = make.obj("common", "common/build-info.cpp");
    const common = make.obj("common", "common/common.cpp");
    const console = make.obj("console", "common/console.cpp");
    const sampling = make.obj("sampling", "common/sampling.cpp");
    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
    const train = make.obj("train", "common/train.cpp");
    const clip = make.obj("clip", "examples/llava/clip.cpp");
    const llava = make.obj("llava", "examples/llava/llava.cpp");

    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });

    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
}

ci/run.sh (619 lines changed)
@@ -1,4 +1,4 @@
-#!/bin/bash
+#/bin/bash
 #
 # sample usage:
 #
@@ -13,9 +13,6 @@
 # # with SYCL support
 # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
-# # with VULKAN support
-# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#
 
 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"
@@ -39,25 +36,20 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
 
 if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
 fi
 
 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
 fi
 
 if [ ! -z ${GG_BUILD_SYCL} ]; then
     if [ -z ${ONEAPI_ROOT} ]; then
-        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
-        echo "source /opt/intel/oneapi/setvars.sh"
+        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
         exit 1
     fi
 
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
-fi
-
-if [ ! -z ${GG_BUILD_VULKAN} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
 fi
 
 ## helpers
 
@@ -110,11 +102,8 @@ function gg_run_ctest_debug {
 
     set -e
 
-    # Check cmake, make and ctest are installed
-    gg_check_build_requirements
-
     (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
 
@@ -141,11 +130,8 @@ function gg_run_ctest_release {
 
     set -e
 
-    # Check cmake, make and ctest are installed
-    gg_check_build_requirements
-
     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     if [ -z ${GG_BUILD_LOW_PERF} ]; then
         (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@@ -166,64 +152,13 @@ function gg_sum_ctest_release {
|
||||||
gg_printf '```\n'
|
gg_printf '```\n'
|
||||||
}
|
}
|
||||||
|
|
||||||
# test_scripts_debug
|
|
||||||
|
|
||||||
function gg_run_test_scripts_debug {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_test_scripts_debug {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'Runs test scripts in debug mode\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '\n'
|
|
||||||
}
|
|
||||||
|
|
||||||
# test_scripts_release
|
|
||||||
|
|
||||||
function gg_run_test_scripts_release {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_test_scripts_release {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'Runs test scripts in release mode\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '\n'
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_get_model {
|
function gg_get_model {
|
||||||
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
|
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
|
||||||
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
|
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||||
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
if [[ -s $gguf_3b ]]; then
|
||||||
if [[ -s $gguf_0 ]]; then
|
echo -n "$gguf_3b"
|
||||||
echo -n "$gguf_0"
|
elif [[ -s $gguf_7b ]]; then
|
||||||
elif [[ -s $gguf_1 ]]; then
|
echo -n "$gguf_7b"
|
||||||
echo -n "$gguf_1"
|
|
||||||
elif [[ -s $gguf_2 ]]; then
|
|
||||||
echo -n "$gguf_2"
|
|
||||||
else
|
else
|
||||||
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
||||||
exit 1
|
exit 1
|
||||||
|
@@ -272,34 +207,33 @@ function gg_sum_ctest_with_model_release {
|
||||||
gg_printf '```\n'
|
gg_printf '```\n'
|
||||||
}
|
}
|
||||||
|
|
||||||
# open_llama_7b_v2
|
# open_llama_3b_v2
|
||||||
|
|
||||||
function gg_run_open_llama_7b_v2 {
|
function gg_run_open_llama_3b_v2 {
|
||||||
cd ${SRC}
|
cd ${SRC}
|
||||||
|
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
|
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
|
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
|
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
|
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
|
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
|
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
|
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
|
|
||||||
|
|
||||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||||
|
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
||||||
|
|
||||||
path_models="../models-mnt/open-llama/7B-v2"
|
path_models="../models-mnt/open-llama/3B-v2"
|
||||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||||
|
|
||||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../convert.py ${path_models}
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
@@ -313,49 +247,46 @@ function gg_run_open_llama_7b_v2 {
|
||||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||||
|
|
||||||
wiki_test="${path_wiki}/wiki.test.raw"
|
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||||
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@@ -384,148 +315,58 @@ function gg_run_open_llama_7b_v2 {
|
||||||
|
|
||||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
set +e
|
# lora
|
||||||
}
|
function compare_ppl {
|
||||||
|
|
||||||
function gg_sum_open_llama_7b_v2 {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'OpenLLaMA 7B-v2:\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
|
||||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
|
||||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
|
||||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
|
||||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
|
||||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
|
||||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
|
||||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
|
||||||
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
|
||||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
|
||||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
|
||||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
|
||||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
|
||||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
|
||||||
}
|
|
||||||
|
|
||||||
# pythia_1.4b
|
|
||||||
|
|
||||||
function gg_run_pythia_1_4b {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
|
|
||||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
|
|
||||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
|
|
||||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
|
|
||||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
|
|
||||||
|
|
||||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
|
||||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
|
||||||
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
|
||||||
|
|
||||||
path_models="../models-mnt/pythia/1.4B"
|
|
||||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
|
||||||
|
|
||||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
|
||||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
|
||||||
|
|
||||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
|
||||||
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
|
||||||
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
|
||||||
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
|
||||||
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
|
||||||
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
|
||||||
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
|
||||||
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
|
||||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
|
||||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
|
||||||
|
|
||||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
|
||||||
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
|
||||||
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
|
||||||
|
|
||||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
|
||||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
|
||||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
|
||||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
|
||||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
|
||||||
|
|
||||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
|
||||||
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
|
|
||||||
function check_ppl {
|
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
|
||||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
|
||||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
|
||||||
return 20
|
return 20
|
||||||
fi
|
fi
|
||||||
|
|
||||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
path_lora="../models-mnt/open-llama/3B-v2/lora"
|
||||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
path_shakespeare="../models-mnt/shakespeare"
|
||||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
|
||||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
|
|
||||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
shakespeare="${path_shakespeare}/shakespeare.txt"
|
||||||
|
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
|
||||||
|
|
||||||
|
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
|
||||||
|
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
|
||||||
|
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
|
||||||
|
|
||||||
|
python3 ../convert-lora-to-ggml.py ${path_lora}
|
||||||
|
|
||||||
|
# f16
|
||||||
|
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
|
||||||
|
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
|
||||||
|
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||||
|
|
||||||
|
# q8_0
|
||||||
|
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
|
||||||
|
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||||
|
|
||||||
|
# q8_0 + f16 lora-base
|
||||||
|
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
|
||||||
|
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
}
|
}
|
||||||
|
|
||||||
function gg_sum_pythia_1_4b {
|
function gg_sum_open_llama_3b_v2 {
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
gg_printf 'Pythia 1.4B:\n'
|
gg_printf 'OpenLLaMA 3B-v2:\n'
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||||
|
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
||||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||||
|
@@ -538,33 +379,42 @@ function gg_sum_pythia_1_4b {
|
||||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
|
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
|
||||||
|
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
|
||||||
|
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
|
||||||
|
gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
|
||||||
|
gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
|
||||||
}
|
}
|
||||||
|
|
||||||
# pythia_2_8b
|
# open_llama_7b_v2
|
||||||
|
# requires: GG_BUILD_CUDA
|
||||||
|
|
||||||
function gg_run_pythia_2_8b {
|
function gg_run_open_llama_7b_v2 {
|
||||||
cd ${SRC}
|
cd ${SRC}
|
||||||
|
|
||||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
|
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
|
||||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
|
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
|
||||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
|
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
|
||||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
|
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
|
||||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
|
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
|
||||||
|
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
|
||||||
|
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
|
||||||
|
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
|
||||||
|
|
||||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||||
|
|
||||||
path_models="../models-mnt/pythia/2.8B"
|
path_models="../models-mnt/open-llama/7B-v2"
|
||||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||||
|
|
||||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../convert.py ${path_models}
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
@@ -580,47 +430,44 @@ function gg_run_pythia_2_8b {
|
||||||
|
|
||||||
wiki_test="${path_wiki}/wiki.test.raw"
|
wiki_test="${path_wiki}/wiki.test.raw"
|
||||||
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@@ -641,7 +488,7 @@ function gg_run_pythia_2_8b {
     check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
     check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
     check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
+    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
     check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
     check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
@ -649,16 +496,59 @@ function gg_run_pythia_2_8b {
|
||||||
|
|
||||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
|
# lora
|
||||||
|
function compare_ppl {
|
||||||
|
qnt="$1"
|
||||||
|
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
|
||||||
|
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
|
||||||
|
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
|
||||||
|
return 20
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
path_lora="../models-mnt/open-llama/7B-v2/lora"
|
||||||
|
path_shakespeare="../models-mnt/shakespeare"
|
||||||
|
|
||||||
|
shakespeare="${path_shakespeare}/shakespeare.txt"
|
||||||
|
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
|
||||||
|
|
||||||
|
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
|
||||||
|
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
|
||||||
|
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
|
||||||
|
|
||||||
|
python3 ../convert-lora-to-ggml.py ${path_lora}
|
||||||
|
|
||||||
|
# f16
|
||||||
|
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
|
||||||
|
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
|
||||||
|
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||||
|
|
||||||
|
# currently not supported by the CUDA backend
|
||||||
|
# q8_0
|
||||||
|
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
|
||||||
|
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
|
||||||
|
#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||||
|
|
||||||
|
# q8_0 + f16 lora-base
|
||||||
|
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
|
||||||
|
#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
}
|
}
|
||||||
|
|
||||||
-function gg_sum_pythia_2_8b {
+function gg_sum_open_llama_7b_v2 {
 gg_printf '### %s\n\n' "${ci}"

-gg_printf 'Pythia 2.8B:\n'
+gg_printf 'OpenLLaMA 7B-v2:\n'
 gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
 gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
 gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
 gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
 gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
 gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"

@@ -671,6 +561,11 @@ function gg_sum_pythia_2_8b {

 gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
 gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
 gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
+gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
+#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
+#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
+#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }

 # bge-small
@@ -679,7 +574,7 @@ function gg_run_embd_bge_small {

 cd ${SRC}

 gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
-gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
+gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
 gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
 gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
 gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin

@@ -697,17 +592,17 @@ function gg_run_embd_bge_small {

 set -e

 (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
+(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+python3 ../convert-hf-to-gguf.py ${path_models}

 model_f16="${path_models}/ggml-model-f16.gguf"
 model_q8_0="${path_models}/ggml-model-q8_0.gguf"

-./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
+./bin/quantize ${model_f16} ${model_q8_0} q8_0

-(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

 set +e
 }

@@ -721,92 +616,8 @@ function gg_sum_embd_bge_small {

 gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
 }
-# rerank_tiny

-function gg_run_rerank_tiny {
-cd ${SRC}

-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
-gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json

-gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json

-path_models="../models-mnt/rerank-tiny"

-rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

-set -e

-(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

-python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

-model_f16="${path_models}/ggml-model-f16.gguf"

-# for this model, the SEP token is "</s>"
-(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

-# sample output
-# rerank score 0: 0.029
-# rerank score 1: 0.029
-# rerank score 2: 0.135

-# check that the score is in the range [$3, $4]
-function check_score {
-qnt="$1"
-score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

-if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
-printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
-return 20
-fi

-printf ' - %s @ %s OK\n' "$qnt" "$score"
-return 0
-}

-check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
-check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
-check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log

-set +e
-}

-function gg_sum_rerank_tiny {
-gg_printf '### %s\n\n' "${ci}"

-gg_printf 'Rerank Tiny (Jina):\n'
-gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
-}
-function gg_check_build_requirements {
-if ! command -v cmake &> /dev/null; then
-gg_printf 'cmake not found, please install'
-fi

-if ! command -v make &> /dev/null; then
-gg_printf 'make not found, please install'
-fi

-if ! command -v ctest &> /dev/null; then
-gg_printf 'ctest not found, please install'
-fi
-}

 ## main

-export LLAMA_LOG_PREFIX=1
-export LLAMA_LOG_TIMESTAMPS=1

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
 # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
 rm -rf ${SRC}/models-mnt

@@ -815,10 +626,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then

 ln -sfn ${mnt_models} ${SRC}/models-mnt

 # Create a fresh python3 venv and enter it
-if ! python3 -m venv "$MNT/venv"; then
+python3 -m venv "$MNT/venv"
-echo "Error: Failed to create Python virtual environment at $MNT/venv."
-exit 1
-fi
 source "$MNT/venv/bin/activate"

 pip install -r ${SRC}/requirements.txt --disable-pip-version-check

@@ -832,19 +640,12 @@ test $ret -eq 0 && gg_run ctest_release

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
 test $ret -eq 0 && gg_run embd_bge_small
-test $ret -eq 0 && gg_run rerank_tiny

-if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
-test $ret -eq 0 && gg_run test_scripts_debug
-test $ret -eq 0 && gg_run test_scripts_release
-fi

 if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
+if [ -z ${GG_BUILD_CUDA} ]; then
-test $ret -eq 0 && gg_run pythia_1_4b
+test $ret -eq 0 && gg_run open_llama_3b_v2
 else
-test $ret -eq 0 && gg_run pythia_2_8b
+test $ret -eq 0 && gg_run open_llama_7b_v2
-#test $ret -eq 0 && gg_run open_llama_7b_v2
 fi
 test $ret -eq 0 && gg_run ctest_with_model_debug
 test $ret -eq 0 && gg_run ctest_with_model_release
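How the CI script above is driven by these GG_BUILD_* flags can be seen from a quick invocation. A minimal sketch, assuming the script lives at ci/run.sh and takes an output directory and a models/cache mount directory as its two arguments (the exact paths here are assumptions, only the environment variables come from the diff above):

    # CPU-only, low-performance run: skips the model-download sections guarded
    # by "if [ -z ${GG_BUILD_LOW_PERF} ]" in the script above.
    mkdir -p tmp/results tmp/mnt
    GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

    # GPU run: GG_BUILD_VRAM_GB >= 8 selects the larger model branch shown above.
    GG_BUILD_CUDA=1 GG_BUILD_VRAM_GB=16 bash ./ci/run.sh ./tmp/results ./tmp/mnt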
@@ -79,22 +79,22 @@ endmacro()

 # flags are for MSVC only!
 check_sse("AVX" " ;/arch:AVX")
 if (NOT ${AVX_FOUND})
-set(GGML_AVX OFF)
+set(LLAMA_AVX OFF)
 else()
-set(GGML_AVX ON)
+set(LLAMA_AVX ON)
 endif()

 check_sse("AVX2" " ;/arch:AVX2")
 check_sse("FMA" " ;/arch:AVX2")
 if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
-set(GGML_AVX2 OFF)
+set(LLAMA_AVX2 OFF)
 else()
-set(GGML_AVX2 ON)
+set(LLAMA_AVX2 ON)
 endif()

 check_sse("AVX512" " ;/arch:AVX512")
 if (NOT ${AVX512_FOUND})
-set(GGML_AVX512 OFF)
+set(LLAMA_AVX512 OFF)
 else()
-set(GGML_AVX512 ON)
+set(LLAMA_AVX512 ON)
 endif()
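The hunk above only renames the SIMD cache options from the LLAMA_ prefix to the GGML_ prefix. A minimal configure sketch, using only the option names that appear in the hunk:

    # One side of the diff uses the GGML_ prefix ...
    cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_AVX2=ON -DGGML_AVX512=OFF
    cmake --build build --config Release -j
    # ... the other side spells the same switches -DLLAMA_AVX2=ON / -DLLAMA_AVX512=OFF.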
@@ -1,16 +0,0 @@
-set( CMAKE_SYSTEM_NAME Darwin )
-set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
-set( target arm64-apple-darwin-macho )
-
-set( CMAKE_C_COMPILER clang )
-set( CMAKE_CXX_COMPILER clang++ )
-
-set( CMAKE_C_COMPILER_TARGET ${target} )
-set( CMAKE_CXX_COMPILER_TARGET ${target} )
-
-set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
-set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
-
-set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
-set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
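This removed file, like the ones that follow, is a CMake cross-compilation toolchain file. A minimal usage sketch; <toolchain>.cmake is a placeholder, since the diff view does not show the file names:

    # Configure with one of the toolchain files and build out of tree.
    cmake -B build-cross -DCMAKE_TOOLCHAIN_FILE=<toolchain>.cmake -DCMAKE_BUILD_TYPE=Release
    cmake --build build-cross --config Release -j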
@@ -1,16 +0,0 @@
-set( CMAKE_SYSTEM_NAME Windows )
-set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
-set( target arm64-pc-windows-msvc )
-
-set( CMAKE_C_COMPILER clang )
-set( CMAKE_CXX_COMPILER clang++ )
-
-set( CMAKE_C_COMPILER_TARGET ${target} )
-set( CMAKE_CXX_COMPILER_TARGET ${target} )
-
-set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
-set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
-
-set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
-set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
@@ -1,6 +0,0 @@
-set( CMAKE_SYSTEM_NAME Windows )
-set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
-set( target arm64-pc-windows-msvc )
-set( CMAKE_C_COMPILER_TARGET ${target} )
-set( CMAKE_CXX_COMPILER_TARGET ${target} )
@@ -1,33 +0,0 @@
-function(llama_add_compile_flags)
-if (LLAMA_FATAL_WARNINGS)
-if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-list(APPEND C_FLAGS -Werror)
-list(APPEND CXX_FLAGS -Werror)
-elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-add_compile_options(/WX)
-endif()
-endif()
-
-if (LLAMA_ALL_WARNINGS)
-if (NOT MSVC)
-list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
--Werror=implicit-int -Werror=implicit-function-declaration)
-
-list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
-
-list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-
-list(APPEND C_FLAGS ${WARNING_FLAGS})
-list(APPEND CXX_FLAGS ${WARNING_FLAGS})
-
-ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
-
-add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
-"$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
-else()
-# todo : msvc
-set(C_FLAGS "" PARENT_SCOPE)
-set(CXX_FLAGS "" PARENT_SCOPE)
-endif()
-endif()
-endfunction()
@@ -1,22 +0,0 @@
-find_package(Git)
-
-# the commit's SHA1
-execute_process(COMMAND
-"${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
-WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-OUTPUT_VARIABLE GIT_SHA1
-ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-# the date of the commit
-execute_process(COMMAND
-"${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
-WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-OUTPUT_VARIABLE GIT_DATE
-ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-# the subject of the commit
-execute_process(COMMAND
-"${GIT_EXECUTABLE}" log -1 --format=%s
-WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
-ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
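The removed file above only captures three pieces of build metadata from git. The same values can be queried directly in a shell; the commands are copied verbatim from the file:

    git describe --match=NeVeRmAtCh --always --abbrev=8   # short commit SHA (never matches a tag)
    git log -1 --format=%ad --date=local                  # commit date
    git log -1 --format=%s                                # commit subject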
@@ -1,30 +0,0 @@
-set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
-set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
-set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
-set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
-
-@PACKAGE_INIT@
-
-set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
-set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
-set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
-
-find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)
-
-find_library(llama_LIBRARY llama
-REQUIRED
-HINTS ${LLAMA_LIB_DIR}
-NO_CMAKE_FIND_ROOT_PATH
-)
-
-add_library(llama UNKNOWN IMPORTED)
-set_target_properties(llama
-PROPERTIES
-INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
-INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
-IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
-IMPORTED_LOCATION "${llama_LIBRARY}"
-INTERFACE_COMPILE_FEATURES c_std_90
-POSITION_INDEPENDENT_CODE ON)
-
-check_required_components(Llama)
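The removed template above is the CMake package-config file that a consumer project picks up through find_package. A minimal sketch of consuming it, assuming the library was installed to an arbitrary prefix such as $HOME/llama-install (the prefix and project layout are assumptions, not part of the diff):

    # Point CMake at the install prefix so the generated llama-config.cmake is found.
    cmake -B build-app -DCMAKE_PREFIX_PATH=$HOME/llama-install
    cmake --build build-app -j
    # Inside the consumer's CMakeLists.txt this corresponds roughly to:
    #   find_package(llama REQUIRED)
    #   target_link_libraries(app PRIVATE llama)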
@@ -1,10 +0,0 @@
-prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=@CMAKE_INSTALL_PREFIX@
-libdir=@CMAKE_INSTALL_FULL_LIBDIR@
-includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
-
-Name: llama
-Description: Port of Facebook's LLaMA model in C/C++
-Version: @LLAMA_INSTALL_VERSION@
-Libs: -L${libdir} -lggml -lggml-base -lllama
-Cflags: -I${includedir}
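The removed file above is a pkg-config template; its Libs and Cflags entries are what pkg-config expands at compile time. A minimal sketch, assuming the generated llama.pc ends up on PKG_CONFIG_PATH and a hypothetical main.c that includes the library header:

    export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
    cc main.c $(pkg-config --cflags --libs llama) -o main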
@@ -1,11 +0,0 @@
-set( CMAKE_SYSTEM_NAME Windows )
-set( CMAKE_SYSTEM_PROCESSOR x86_64 )
-
-set( CMAKE_C_COMPILER clang )
-set( CMAKE_CXX_COMPILER clang++ )
-
-set( arch_c_flags "-march=native" )
-
-set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
-set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
14
codecov.yml
Normal file

@@ -0,0 +1,14 @@
+comment: off
+
+coverage:
+  status:
+    project:
+      default:
+        target: auto
+        threshold: 0
+        base: auto
+    patch:
+      default:
+        target: auto
+        threshold: 0
+        base: auto
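Before relying on the added codecov.yml, it is worth checking that it parses. A minimal sketch, assuming Codecov's public YAML validation endpoint is still available (the URL is an assumption, not part of the diff):

    # POST the config to Codecov's validator and inspect the response.
    curl --silent --data-binary @codecov.yml https://codecov.io/validate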
Some files were not shown because too many files have changed in this diff.