Compare commits

..

6 commits

Author SHA1 Message Date
Georgi Gerganov
12aa74ba7d
minor : spacing 2024-03-22 15:24:57 +02:00
fraxy-v
2605c139a6
Update build.yml 2024-03-20 08:59:38 +02:00
fraxy-v
3e9d3dbff9
Update build.yml 2024-03-20 08:50:46 +02:00
fraxy-v
6014a63125
Update build.yml 2024-03-19 16:52:01 +02:00
Y. Velkov
927be9b58e add test in build action 2024-03-19 16:25:23 +02:00
Y. Velkov
284800b1e3 convert-llama2c-to-ggml: enable conversion of multiqueries, #5608 2024-03-19 15:23:59 +02:00
1195 changed files with 199369 additions and 314735 deletions

View file

@ -1,161 +0,0 @@
---
Language: Cpp
AlignAfterOpenBracket: Align
AlignArrayOfStructures: Left
AlignConsecutiveAssignments: AcrossComments
AlignConsecutiveBitFields: AcrossComments
AlignConsecutiveDeclarations: AcrossComments
AlignConsecutiveMacros: AcrossComments
# AlignConsecutiveShortCaseStatements: AcrossComments
AlignEscapedNewlines: Left # LeftWithLastLine
AlignOperands: Align
AlignTrailingComments:
Kind: Always
OverEmptyLines: 1
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: false
# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: true
BinPackParameters: true # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
AfterCaseLabel: true
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
BeforeLambdaBody: false
BeforeWhile: false
IndentBraces: false
SplitEmptyFunction: false
SplitEmptyRecord: false
SplitEmptyNamespace: false
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never
BreakBeforeBinaryOperators: None
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: false
# BreakBinaryOperations: Never
BreakConstructorInitializers: AfterColon
# BreakFunctionDefinitionParameters: false
BreakInheritanceList: AfterComma
BreakStringLiterals: true
# BreakTemplateDeclarations: Yes
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
EmptyLineBeforeAccessModifier: Leave
EmptyLineAfterAccessModifier: Never
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<.*\.h>'
Priority: 1
SortPriority: 0
- Regex: '^<.*'
Priority: 2
SortPriority: 0
- Regex: '.*'
Priority: 3
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseBlocks: true
IndentCaseLabels: true
IndentExternBlock: NoIndent
IndentGotoLabels: false
IndentPPDirectives: AfterHash
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertBraces: true # NOTE: may lead to incorrect formatting
InsertNewlineAtEOF: true
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
LambdaBodyIndentation: Signature
LineEnding: LF
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PPIndentWidth: -1
PackConstructorInitializers: CurrentLine
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Middle
QualifierAlignment: Left
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
ReferenceAlignment: Middle
ReflowComments: false # IndentOnly
SeparateDefinitionBlocks: Always
SortIncludes: CaseInsensitive
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: true
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInContainerLiterals: true
SpacesInLineCommentPrefix:
Minimum: 1
Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: c++17
TabWidth: 4
UseTab: Never
WhitespaceSensitiveMacros: ['STRINGIZE']
...

View file

@ -12,15 +12,12 @@ Checks: >
-readability-implicit-bool-conversion, -readability-implicit-bool-conversion,
-readability-magic-numbers, -readability-magic-numbers,
-readability-uppercase-literal-suffix, -readability-uppercase-literal-suffix,
-readability-simplify-boolean-expr,
clang-analyzer-*, clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*, performance-*,
portability-*, portability-*,
-portability-simd-intrinsics,
misc-*, misc-*,
-misc-const-correctness, -misc-const-correctness,
-misc-non-private-member-variables-in-classes, -misc-non-private-member-variables-in-classes,
-misc-no-recursion, -misc-no-recursion,
-misc-use-anonymous-namespace,
FormatStyle: none FormatStyle: none

View file

@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
stage('Running llama.cpp'){ stage('Running llama.cpp'){
sh'''#!/bin/bash sh'''#!/bin/bash
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64 qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
cat llama_log.txt # Printing results cat llama_log.txt # Printing results
''' '''
} }

View file

@ -1,92 +0,0 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
ARG TARGETARCH
ARG GGML_CPU_ARM_ARCH=armv8-a
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "$TARGETARCH" = "amd64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
elif [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
else \
echo "Unsupported architecture"; \
exit 1; \
fi && \
cmake --build build -j $(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View file

@ -1,94 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
WORKDIR /app
COPY . .
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View file

@ -0,0 +1,34 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
RUN make
ENTRYPOINT ["/app/.devops/tools.sh"]

View file

@ -0,0 +1,45 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT ["/app/.devops/tools.sh"]

22
.devops/full.Dockerfile Normal file
View file

@ -0,0 +1,22 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
RUN make
ENV LC_ALL=C.utf8
ENTRYPOINT ["/app/.devops/tools.sh"]

View file

@ -1,91 +0,0 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
## Build Image
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" \
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
### Full
FROM base AS full
COPY --from=build /app/lib/ /app
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View file

@ -1,44 +0,0 @@
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
FROM ascendai/cann:$ASCEND_VERSION AS build
WORKDIR /app
COPY . .
RUN yum install -y gcc g++ cmake make
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
# find libascend_hal.so, because the drive hasn`t been mounted.
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --config Release --target llama-cli
# TODO: use image with NNRT
FROM ascendai/cann:$ASCEND_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
ENTRYPOINT ["/llama-cli" ]

View file

@ -0,0 +1,84 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp-clblast
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: OpenCL Inference of LLaMA model in C/C++
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
Requires: clblast
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
CPU inference for Meta's Lllama2 models using default options.
%prep
%setup -n llama.cpp-master
%build
make -j LLAMA_CLBLAST=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamaclblast
cp -p server %{buildroot}%{_bindir}/llamaclblastserver
cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
[Install]
WantedBy=default.target
EOF
mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF
%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llamaclblast
%{_bindir}/llamaclblastserver
%{_bindir}/llamaclblastsimple
/usr/lib/systemd/system/llamaclblast.service
%config /etc/sysconfig/llama
%pre
%post
%preun
%postun
%changelog

View file

@ -1,5 +1,5 @@
# SRPM for building from source and packaging an RPM for RPM-based distros. # SRPM for building from source and packaging an RPM for RPM-based distros.
# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages # https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com # Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
@ -12,7 +12,7 @@
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries. # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support. # It is up to the user to install the correct vendor-specific support.
Name: llama.cpp-cuda Name: llama.cpp-cublas
Version: %( date "+%%Y%%m%%d" ) Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist} Release: 1%{?dist}
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
@ -32,16 +32,16 @@ CPU inference for Meta's Lllama2 models using default options.
%setup -n llama.cpp-master %setup -n llama.cpp-master
%build %build
make -j GGML_CUDA=1 make -j LLAMA_CUBLAS=1
%install %install
mkdir -p %{buildroot}%{_bindir}/ mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli cp -p main %{buildroot}%{_bindir}/llamacppcublas
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
mkdir -p %{buildroot}/usr/lib/systemd/system mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
[Unit] [Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build). Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
[Service] [Service]
Type=simple Type=simple
EnvironmentFile=/etc/sysconfig/llama EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID ExecReload=/bin/kill -s HUP $MAINPID
Restart=never Restart=never
@ -67,10 +67,10 @@ rm -rf %{buildroot}
rm -rf %{_builddir}/* rm -rf %{_builddir}/*
%files %files
%{_bindir}/llama-cuda-cli %{_bindir}/llamacppcublas
%{_bindir}/llama-cuda-server %{_bindir}/llamacppcublasserver
%{_bindir}/llama-cuda-simple %{_bindir}/llamacppcublassimple
/usr/lib/systemd/system/llamacuda.service /usr/lib/systemd/system/llamacublas.service
%config /etc/sysconfig/llama %config /etc/sysconfig/llama
%pre %pre

View file

@ -1,5 +1,5 @@
# SRPM for building from source and packaging an RPM for RPM-based distros. # SRPM for building from source and packaging an RPM for RPM-based distros.
# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages # https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com # Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
@ -38,9 +38,9 @@ make -j
%install %install
mkdir -p %{buildroot}%{_bindir}/ mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli cp -p main %{buildroot}%{_bindir}/llama
cp -p llama-server %{buildroot}%{_bindir}/llama-server cp -p server %{buildroot}%{_bindir}/llamaserver
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple cp -p simple %{buildroot}%{_bindir}/llamasimple
mkdir -p %{buildroot}/usr/lib/systemd/system mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
[Service] [Service]
Type=simple Type=simple
EnvironmentFile=/etc/sysconfig/llama EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llama-server $LLAMA_ARGS ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID ExecReload=/bin/kill -s HUP $MAINPID
Restart=never Restart=never
@ -69,9 +69,9 @@ rm -rf %{buildroot}
rm -rf %{_builddir}/* rm -rf %{_builddir}/*
%files %files
%{_bindir}/llama-cli %{_bindir}/llama
%{_bindir}/llama-server %{_bindir}/llamaserver
%{_bindir}/llama-simple %{_bindir}/llamasimple
/usr/lib/systemd/system/llama.service /usr/lib/systemd/system/llama.service
%config /etc/sysconfig/llama %config /etc/sysconfig/llama

View file

@ -0,0 +1,32 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
RUN make
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
COPY --from=build /app/main /main
ENTRYPOINT [ "/main" ]

View file

@ -0,0 +1,28 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
RUN mkdir build && \
cd build && \
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build . --config Release --target main
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/main /main
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]

View file

@ -0,0 +1,45 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT [ "/app/main" ]

View file

@ -0,0 +1,29 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk
# Build it
WORKDIR /app
COPY . .
RUN mkdir build && \
cd build && \
cmake .. -DLLAMA_VULKAN=1 && \
cmake --build . --config Release --target main
# Clean up
WORKDIR /
RUN cp /app/build/bin/main /main && \
rm -rf /app
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]

20
.devops/main.Dockerfile Normal file
View file

@ -0,0 +1,20 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
RUN make
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/main /main
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]

View file

@ -1,108 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y \
build-essential \
cmake \
python3 \
python3-pip \
git \
libcurl4-openssl-dev \
libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ${BASE_MUSA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View file

@ -6,10 +6,11 @@
let let
inherit (config.packages) default; inherit (config.packages) default;
binaries = [ binaries = [
"llama-cli" "llama"
"llama-embedding" "llama-embedding"
"llama-server" "llama-server"
"llama-quantize" "quantize"
"train-text-from-scratch"
]; ];
mkApp = name: { mkApp = name: {
type = "app"; type = "app";

View file

@ -1,52 +1,13 @@
{ inputs, ... }:
{ {
perSystem = perSystem =
{ { config, lib, ... }:
config,
lib,
system,
...
}:
{ {
devShells = devShells =
let lib.concatMapAttrs
pkgs = import inputs.nixpkgs { inherit system; }; (name: package: {
stdenv = pkgs.stdenv; ${name} = package.passthru.shell;
scripts = config.packages.python-scripts; ${name + "-extra"} = package.passthru.shell-extra;
in })
lib.pipe (config.packages) [ config.packages;
(lib.concatMapAttrs (
name: package: {
${name} = pkgs.mkShell {
name = "${name}";
inputsFrom = [ package ];
shellHook = ''
echo "Entering ${name} devShell"
'';
};
"${name}-extra" =
if (name == "python-scripts") then
null
else
pkgs.mkShell {
name = "${name}-extra";
inputsFrom = [
package
scripts
];
# Extra packages that *may* be used by some scripts
packages = [
pkgs.python3Packages.tiktoken
];
shellHook = ''
echo "Entering ${name} devShell"
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
'';
};
}
))
(lib.filterAttrs (name: value: value != null))
];
}; };
} }

View file

@ -26,14 +26,16 @@
config.cudaSupport = true; config.cudaSupport = true;
config.allowUnfreePredicate = config.allowUnfreePredicate =
p: p:
builtins.all ( builtins.all
(
license: license:
license.free license.free
|| builtins.elem license.shortName [ || builtins.elem license.shortName [
"CUDA EULA" "CUDA EULA"
"cuDNN EULA" "cuDNN EULA"
] ]
) (p.meta.licenses or [ p.meta.license ]); )
(p.meta.licenses or [ p.meta.license ]);
}; };
# Ensure dependencies use ROCm consistently # Ensure dependencies use ROCm consistently
pkgsRocm = import inputs.nixpkgs { pkgsRocm = import inputs.nixpkgs {

View file

@ -1,36 +0,0 @@
{
lib,
llamaVersion,
numpy,
tqdm,
sentencepiece,
pyyaml,
poetry-core,
buildPythonPackage,
pytestCheckHook,
}:
buildPythonPackage {
pname = "gguf";
version = llamaVersion;
pyproject = true;
nativeBuildInputs = [ poetry-core ];
propagatedBuildInputs = [
numpy
tqdm
sentencepiece
pyyaml
];
src = lib.cleanSource ../../gguf-py;
pythonImportsCheck = [
"numpy"
"gguf"
];
nativeCheckInputs = [ pytestCheckHook ];
doCheck = true;
meta = with lib; {
description = "Python package for writing binary files in the GGUF format";
license = licenses.mit;
maintainers = [ maintainers.ditsuke ];
};
}

View file

@ -1,47 +1,36 @@
{ {
lib, lib,
glibc,
config, config,
stdenv, stdenv,
runCommand, mkShell,
cmake, cmake,
ninja, ninja,
pkg-config, pkg-config,
git, git,
python3,
mpi, mpi,
blas, openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
cudaPackages, cudaPackages,
autoAddDriverRunpath,
darwin, darwin,
rocmPackages, rocmPackages,
vulkan-headers, vulkan-headers,
vulkan-loader, vulkan-loader,
curl, clblast,
shaderc, useBlas ? builtins.all (x: !x) [
useBlas ?
builtins.all (x: !x) [
useCuda useCuda
useMetalKit useMetalKit
useOpenCL
useRocm useRocm
useVulkan useVulkan
] ],
&& blas.meta.available,
useCuda ? config.cudaSupport, useCuda ? config.cudaSupport,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin, useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
# Increases the runtime closure size by ~700M useMpi ? false, # Increases the runtime closure size by ~700M
useMpi ? false, useOpenCL ? false,
useRocm ? config.rocmSupport, useRocm ? config.rocmSupport,
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
enableCurl ? true,
useVulkan ? false, useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
}@inputs:
# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
enableStatic ? effectiveStdenv.hostPlatform.isStatic,
precompileMetalShaders ? false,
}:
let let
inherit (lib) inherit (lib)
@ -49,29 +38,51 @@ let
cmakeFeature cmakeFeature
optionals optionals
strings strings
versionOlder
; ;
# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
stdenv = throw "Use effectiveStdenv instead"; stdenv = throw "Use effectiveStdenv instead";
effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
suffices = suffices =
lib.optionals useBlas [ "BLAS" ] lib.optionals useBlas [ "BLAS" ]
++ lib.optionals useCuda [ "CUDA" ] ++ lib.optionals useCuda [ "CUDA" ]
++ lib.optionals useMetalKit [ "MetalKit" ] ++ lib.optionals useMetalKit [ "MetalKit" ]
++ lib.optionals useMpi [ "MPI" ] ++ lib.optionals useMpi [ "MPI" ]
++ lib.optionals useOpenCL [ "OpenCL" ]
++ lib.optionals useRocm [ "ROCm" ] ++ lib.optionals useRocm [ "ROCm" ]
++ lib.optionals useVulkan [ "Vulkan" ]; ++ lib.optionals useVulkan [ "Vulkan" ];
pnameSuffix = pnameSuffix =
strings.optionalString (suffices != [ ]) strings.optionalString (suffices != [ ])
"-${strings.concatMapStringsSep "-" strings.toLower suffices}"; "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
descriptionSuffix = strings.optionalString ( descriptionSuffix =
suffices != [ ] strings.optionalString (suffices != [ ])
) ", accelerated with ${strings.concatStringsSep ", " suffices}"; ", accelerated with ${strings.concatStringsSep ", " suffices}";
xcrunHost = runCommand "xcrunHost" { } '' # TODO: package the Python in this repository in a Nix-like way.
mkdir -p $out/bin # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
ln -s /usr/bin/xcrun $out/bin # is PEP 517-compatible, and ensure the correct .dist-info is generated.
''; # https://peps.python.org/pep-0517/
llama-python = python3.withPackages (
ps: [
ps.numpy
ps.sentencepiece
]
);
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
llama-python-extra = python3.withPackages (
ps: [
ps.numpy
ps.sentencepiece
ps.tiktoken
ps.torchWithoutCuda
ps.transformers
]
);
# apple_sdk is supposed to choose sane defaults, no need to handle isAarch64 # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
# separately # separately
@ -85,9 +96,16 @@ let
++ optionals useMetalKit [ MetalKit ]; ++ optionals useMetalKit [ MetalKit ];
cudaBuildInputs = with cudaPackages; [ cudaBuildInputs = with cudaPackages; [
cuda_cudart cuda_cccl.dev # <nv/target>
cuda_cccl # <nv/target>
libcublas # A temporary hack for reducing the closure size, remove once cudaPackages
# have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
cuda_cudart.dev
cuda_cudart.lib
cuda_cudart.static
libcublas.dev
libcublas.lib
libcublas.static
]; ];
rocmBuildInputs = with rocmPackages; [ rocmBuildInputs = with rocmPackages; [
@ -99,11 +117,11 @@ let
vulkanBuildInputs = [ vulkanBuildInputs = [
vulkan-headers vulkan-headers
vulkan-loader vulkan-loader
shaderc
]; ];
in in
effectiveStdenv.mkDerivation (finalAttrs: { effectiveStdenv.mkDerivation (
finalAttrs: {
pname = "llama-cpp${pnameSuffix}"; pname = "llama-cpp${pnameSuffix}";
version = llamaVersion; version = llamaVersion;
@ -127,19 +145,14 @@ effectiveStdenv.mkDerivation (finalAttrs: {
}; };
postPatch = '' postPatch = ''
substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \ substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
'';
# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015, # TODO: Package up each Python script or service appropriately.
# `default.metallib` may be compiled with Metal compiler from XCode # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
# and we need to escape sandbox on MacOS to access Metal compiler. # we could make those *.py into setuptools' entrypoints
# `xcrun` is used find the path of the Metal compiler, which is varible substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
# and not on $PATH '';
# see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
__noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
nativeBuildInputs = nativeBuildInputs =
[ [
@ -151,33 +164,32 @@ effectiveStdenv.mkDerivation (finalAttrs: {
++ optionals useCuda [ ++ optionals useCuda [
cudaPackages.cuda_nvcc cudaPackages.cuda_nvcc
autoAddDriverRunpath # TODO: Replace with autoAddDriverRunpath
] # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ] cudaPackages.autoAddOpenGLRunpathHook
++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ]; ];
buildInputs = buildInputs =
optionals effectiveStdenv.isDarwin darwinBuildInputs optionals effectiveStdenv.isDarwin darwinBuildInputs
++ optionals useCuda cudaBuildInputs ++ optionals useCuda cudaBuildInputs
++ optionals useMpi [ mpi ] ++ optionals useMpi [ mpi ]
++ optionals useOpenCL [ clblast ]
++ optionals useRocm rocmBuildInputs ++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ] ++ optionals useVulkan vulkanBuildInputs;
++ optionals useVulkan vulkanBuildInputs
++ optionals enableCurl [ curl ];
cmakeFlags = cmakeFlags =
[ [
(cmakeBool "LLAMA_NATIVE" false)
(cmakeBool "LLAMA_BUILD_SERVER" true) (cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic)) (cmakeBool "BUILD_SHARED_LIBS" true)
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_CURL" enableCurl) (cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "GGML_NATIVE" false) (cmakeBool "LLAMA_CLBLAST" useOpenCL)
(cmakeBool "GGML_BLAS" useBlas) (cmakeBool "LLAMA_CUBLAS" useCuda)
(cmakeBool "GGML_CUDA" useCuda) (cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "GGML_HIP" useRocm) (cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "GGML_METAL" useMetalKit) (cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "GGML_VULKAN" useVulkan) (cmakeBool "LLAMA_VULKAN" useVulkan)
(cmakeBool "GGML_STATIC" enableStatic)
] ]
++ optionals useCuda [ ++ optionals useCuda [
( (
@ -188,32 +200,62 @@ effectiveStdenv.mkDerivation (finalAttrs: {
) )
] ]
++ optionals useRocm [ ++ optionals useRocm [
(cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang") (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
(cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets) (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
]
++ optionals useMetalKit [
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
(cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
];
# Environment variables needed for ROCm # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
env = optionals useRocm { # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
ROCM_PATH = "${rocmPackages.clr}"; # and select the line that matches the current nixpkgs version of rocBLAS.
HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode"; # Should likely use `rocmPackages.clr.gpuTargets`.
}; "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
]
++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
# if they haven't been added yet. # if they haven't been added yet.
postInstall = '' postInstall = ''
mv $out/bin/main $out/bin/llama
mv $out/bin/server $out/bin/llama-server
mkdir -p $out/include mkdir -p $out/include
cp $src/include/llama.h $out/include/ cp $src/llama.h $out/include/
''; '';
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
passthru = {
inherit
useBlas
useCuda
useMetalKit
useMpi
useOpenCL
useRocm
useVulkan
;
shell = mkShell {
name = "shell-${finalAttrs.finalPackage.name}";
description = "contains numpy and sentencepiece";
buildInputs = [ llama-python ];
inputsFrom = [ finalAttrs.finalPackage ];
shellHook = ''
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
'';
};
shell-extra = mkShell {
name = "shell-extra-${finalAttrs.finalPackage.name}";
description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
buildInputs = [ llama-python-extra ];
inputsFrom = [ finalAttrs.finalPackage ];
};
};
meta = { meta = {
# Configurations we don't want even the CI to evaluate. Results in the # Configurations we don't want even the CI to evaluate. Results in the
# "unsupported platform" messages. This is mostly a no-op, because # "unsupported platform" messages. This is mostly a no-op, because
# cudaPackages would've refused to evaluate anyway. # cudaPackages would've refused to evaluate anyway.
badPlatforms = optionals useCuda lib.platforms.darwin; badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
# Configurations that are known to result in build failures. Can be # Configurations that are known to result in build failures. Can be
# overridden by importing Nixpkgs with `allowBroken = true`. # overridden by importing Nixpkgs with `allowBroken = true`.
@ -224,7 +266,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
license = lib.licenses.mit; license = lib.licenses.mit;
# Accommodates `nix run` and `lib.getExe` # Accommodates `nix run` and `lib.getExe`
mainProgram = "llama-cli"; mainProgram = "llama";
# These people might respond, on the best effort basis, if you ping them # These people might respond, on the best effort basis, if you ping them
# in case of Nix-specific regressions or for reviewing Nix-specific PRs. # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
@ -244,4 +286,5 @@ effectiveStdenv.mkDerivation (finalAttrs: {
# Extend `badPlatforms` instead # Extend `badPlatforms` instead
platforms = lib.platforms.all; platforms = lib.platforms.all;
}; };
}) }
)

View file

@ -1,66 +0,0 @@
{
lib,
stdenv,
buildPythonPackage,
poetry-core,
mkShell,
python3Packages,
gguf-py,
}@inputs:
let
llama-python-deps = with python3Packages; [
numpy
sentencepiece
transformers
protobuf
torchWithoutCuda
gguf-py
tqdm
# for scripts/compare-llama-bench.py
gitpython
tabulate
# for examples/pydantic-models-to-grammar-examples.py
docstring-parser
pydantic
];
llama-python-test-deps = with python3Packages; [
# Server bench
matplotlib
# server tests
openai
pytest
prometheus-client
];
in
buildPythonPackage ({
pname = "llama-scripts";
version = "0.0.0";
pyproject = true;
# NOTE: The files filtered out here are not visible in the build sandbox, neither
# do they affect the output hash. They can be modified without triggering a rebuild.
src = lib.cleanSourceWith {
filter =
name: type:
let
any = builtins.any (x: x);
baseName = builtins.baseNameOf name;
in
any [
(lib.hasSuffix ".py" name)
(baseName == "README.md")
(baseName == "pyproject.toml")
];
src = lib.cleanSource ../../.;
};
nativeBuildInputs = [ poetry-core ];
nativeCheckInputs = llama-python-test-deps;
dependencies = llama-python-deps;
})

View file

@ -1,41 +1,19 @@
{ {
lib, lib,
newScope, newScope,
python3,
llamaVersion ? "0.0.0", llamaVersion ? "0.0.0",
}: }:
let
pythonPackages = python3.pkgs;
buildPythonPackage = pythonPackages.buildPythonPackage;
numpy = pythonPackages.numpy;
tqdm = pythonPackages.tqdm;
sentencepiece = pythonPackages.sentencepiece;
pyyaml = pythonPackages.pyyaml;
poetry-core = pythonPackages.poetry-core;
pytestCheckHook = pythonPackages.pytestCheckHook;
in
# We're using `makeScope` instead of just writing out an attrset # We're using `makeScope` instead of just writing out an attrset
# because it allows users to apply overlays later using `overrideScope'`. # because it allows users to apply overlays later using `overrideScope'`.
# Cf. https://noogle.dev/f/lib/makeScope # Cf. https://noogle.dev/f/lib/makeScope
lib.makeScope newScope (self: { lib.makeScope newScope (
self: {
inherit llamaVersion; inherit llamaVersion;
gguf-py = self.callPackage ./package-gguf-py.nix {
inherit
buildPythonPackage
numpy
tqdm
sentencepiece
poetry-core
pyyaml
pytestCheckHook
;
};
python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
llama-cpp = self.callPackage ./package.nix { }; llama-cpp = self.callPackage ./package.nix { };
docker = self.callPackage ./docker.nix { }; docker = self.callPackage ./docker.nix { };
docker-min = self.callPackage ./docker.nix { interactive = false; }; docker-min = self.callPackage ./docker.nix { interactive = false; };
sif = self.callPackage ./sif.nix { }; sif = self.callPackage ./sif.nix { };
}) }
)

View file

@ -1,113 +0,0 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.3
ARG AMDGPU_VERSION=6.3
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
### Build image
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
# gfx906 is deprecated
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
ARG ROCM_DOCKER_ARCH=gfx1100
# Set nvcc architectured
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
# ENV CC=/opt/rocm/llvm/bin/clang
# ENV CXX=/opt/rocm/llvm/bin/clang++
RUN apt-get update \
&& apt-get install -y \
build-essential \
cmake \
git \
libcurl4-openssl-dev \
curl \
libgomp1
WORKDIR /app
COPY . .
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
&& cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib \
&& find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ${BASE_ROCM_DEV_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3-pip \
python3 \
python3-wheel\
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View file

@ -0,0 +1,32 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
RUN make
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
COPY --from=build /app/server /server
ENTRYPOINT [ "/server" ]

View file

@ -0,0 +1,28 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
RUN mkdir build && \
cd build && \
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build . --config Release --target server
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]

View file

@ -0,0 +1,45 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT [ "/app/server" ]

View file

@ -0,0 +1,29 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk
# Build it
WORKDIR /app
COPY . .
RUN mkdir build && \
cd build && \
cmake .. -DLLAMA_VULKAN=1 && \
cmake --build . --config Release --target server
# Clean up
WORKDIR /
RUN cp /app/build/bin/server /server && \
rm -rf /app
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]

20
.devops/server.Dockerfile Normal file
View file

@ -0,0 +1,20 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
RUN make
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]

View file

@ -8,40 +8,36 @@ arg1="$1"
shift shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
exec python3 ./convert_hf_to_gguf.py "$@" python3 ./convert.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
exec ./llama-quantize "$@" ./quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
exec ./llama-cli "$@" ./main "$@"
elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
exec ./llama-bench "$@" ./finetune "$@"
elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
exec ./llama-perplexity "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..." echo "Converting PTH to GGML..."
for i in $(ls $1/$2/ggml-model-f16.bin*); do for i in `ls $1/$2/ggml-model-f16.bin*`; do
if [ -f "${i/f16/q4_0}" ]; then if [ -f "${i/f16/q4_0}" ]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}" echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0 ./quantize "$i" "${i/f16/q4_0}" q4_0
fi fi
done done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
exec ./llama-server "$@" ./server "$@"
else else
echo "Unknown command: $arg1" echo "Unknown command: $arg1"
echo "Available commands: " echo "Available commands: "
echo " --run (-r): Run a model previously converted into ggml" echo " --run (-r): Run a model previously converted into ggml"
echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512" echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
echo " --bench (-b): Benchmark the performance of the inference for various parameters."
echo " ex: -m model.gguf"
echo " --perplexity (-p): Measure the perplexity of a model over a given text."
echo " ex: -m model.gguf -f file.txt"
echo " --convert (-c): Convert a llama model into ggml" echo " --convert (-c): Convert a llama model into ggml"
echo " ex: --outtype f16 \"/models/7B/\" " echo " ex: --outtype f16 \"/models/7B/\" "
echo " --quantize (-q): Optimize with quantization process ggml" echo " --quantize (-q): Optimize with quantization process ggml"
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
echo " --finetune (-f): Run finetune command to create a lora finetune of the model"
echo " See documentation for finetune for command-line parameters"
echo " --all-in-one (-a): Execute --convert & --quantize" echo " --all-in-one (-a): Execute --convert & --quantize"
echo " ex: \"/models/\" 7B" echo " ex: \"/models/\" 7B"
echo " --server (-s): Run a model on the server" echo " --server (-s): Run a model on the server"

View file

@ -1,89 +0,0 @@
ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
apt update -y && \
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl libvulkan-dev \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View file

@ -1,7 +1,7 @@
*.o *.o
*.a *.a
.cache/ .cache/
# Do not ignore .git directory, otherwise the reported build number will always be 0 .git/
.github/ .github/
.gitignore .gitignore
.vs/ .vs/
@ -12,8 +12,8 @@ build*/
models/* models/*
/llama-cli /main
/llama-quantize /quantize
arm_neon.h arm_neon.h
compile_commands.json compile_commands.json

2
.ecrc
View file

@ -1,5 +1,5 @@
{ {
"Exclude": ["^\\.gitmodules$", "stb_image\\.h"], "Exclude": ["^\\.gitmodules$"],
"Disable": { "Disable": {
"IndentSize": true "IndentSize": true
} }

View file

@ -24,27 +24,5 @@ insert_final_newline = unset
[examples/server/public/*] [examples/server/public/*]
indent_size = 2 indent_size = 2
[examples/server/public/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset
[examples/server/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset
[examples/llama.swiftui/llama.swiftui.xcodeproj/*] [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab indent_style = tab
[examples/cvector-generator/*.txt]
trim_trailing_whitespace = unset
insert_final_newline = unset
[models/templates/*.jinja]
indent_style = unset
indent_size = unset
end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset

16
.flake8
View file

@ -1,17 +1,3 @@
[flake8] [flake8]
max-line-length = 125 max-line-length = 125
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503 ignore = W503
exclude =
# Do not traverse examples
examples,
# Do not include package initializers
__init__.py,
# No need to traverse our git directory
.git,
# There's no value in checking cache directories
__pycache__,
# No need to include the build path
build,
# This contains builds that we don't want to check
dist # This is generated with `python build .` for package releases
# max-complexity = 10

View file

@ -1,87 +0,0 @@
name: Bug (compilation)
description: Something goes wrong when trying to compile llama.cpp.
title: "Compile bug: "
labels: ["bug-unconfirmed", "compilation"]
body:
- type: markdown
attributes:
value: >
Thanks for taking the time to fill out this bug report!
This issue template is intended for bug reports where the compilation of llama.cpp fails.
Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
by clearing `~/.cache/ccache` (on Linux).
- type: textarea
id: commit
attributes:
label: Git commit
description: Which commit are you trying to compile?
placeholder: |
$git rev-parse HEAD
84a07a17b1b08cf2b9747c633a2372782848a27f
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: Operating systems
description: Which operating systems do you know to be affected?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: true
- type: dropdown
id: backends
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
multiple: true
validations:
required: true
- type: textarea
id: info
attributes:
label: Problem description & steps to reproduce
description: >
Please give us a summary of the problem and tell us how to reproduce it.
If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
placeholder: >
I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
Here are the exact commands that I used: ...
validations:
required: true
- type: textarea
id: first_bad_commit
attributes:
label: First Bad Commit
description: >
If the bug was not present on an earlier version: when did it start appearing?
If possible, please do a git bisect and identify the exact commit that introduced the bug.
validations:
required: false
- type: textarea
id: command
attributes:
label: Compile command
description: >
Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
This will be automatically formatted into code, so no need for backticks.
render: shell
validations:
required: true
- type: textarea
id: logs
attributes:
label: Relevant log output
description: >
Please copy and paste any relevant log output, including any generated text.
This will be automatically formatted into code, so no need for backticks.
render: shell
validations:
required: true

View file

@ -1,101 +0,0 @@
name: Bug (model use)
description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
title: "Eval bug: "
labels: ["bug-unconfirmed", "model evaluation"]
body:
- type: markdown
attributes:
value: >
Thanks for taking the time to fill out this bug report!
This issue template is intended for bug reports where the model evaluation results
(i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
If you encountered the issue while using an external UI (e.g. ollama),
please reproduce your issue using one of the examples/binaries in this repository.
The `llama-cli` binary can be used for simple and reproducible model inference.
- type: textarea
id: version
attributes:
label: Name and Version
description: Which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./llama-cli --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: Operating systems
description: Which operating systems do you know to be affected?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: true
- type: dropdown
id: backends
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
multiple: true
validations:
required: true
- type: textarea
id: hardware
attributes:
label: Hardware
description: Which CPUs/GPUs are you using?
placeholder: >
e.g. Ryzen 5950X + 2x RTX 4090
validations:
required: true
- type: textarea
id: model
attributes:
label: Models
description: >
Which model(s) at which quantization were you using when encountering the bug?
If you downloaded a GGUF file off of Huggingface, please provide a link.
placeholder: >
e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
validations:
required: false
- type: textarea
id: info
attributes:
label: Problem description & steps to reproduce
description: >
Please give us a summary of the problem and tell us how to reproduce it.
If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
that information would be very much appreciated by us.
placeholder: >
e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
When I use -ngl 0 it works correctly.
Here are the exact commands that I used: ...
validations:
required: true
- type: textarea
id: first_bad_commit
attributes:
label: First Bad Commit
description: >
If the bug was not present on an earlier version: when did it start appearing?
If possible, please do a git bisect and identify the exact commit that introduced the bug.
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: >
Please copy and paste any relevant log output, including the command that you entered and any generated text.
This will be automatically formatted into code, so no need for backticks.
render: shell
validations:
required: true

View file

@ -1,91 +0,0 @@
name: Bug (misc.)
description: Something is not working the way it should (and it's not covered by any of the above cases).
title: "Misc. bug: "
labels: ["bug-unconfirmed"]
body:
- type: markdown
attributes:
value: >
Thanks for taking the time to fill out this bug report!
This issue template is intended for miscellaneous bugs that don't fit into any other category.
If you encountered the issue while using an external UI (e.g. ollama),
please reproduce your issue using one of the examples/binaries in this repository.
- type: textarea
id: version
attributes:
label: Name and Version
description: Which version of our software is affected? (You can use `--version` to get a version string.)
placeholder: |
$./llama-cli --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: Operating systems
description: Which operating systems do you know to be affected?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: dropdown
id: module
attributes:
label: Which llama.cpp modules do you know to be affected?
multiple: true
options:
- Documentation/Github
- libllama (core library)
- llama-cli
- llama-server
- llama-bench
- llama-quantize
- Python/Bash scripts
- Test code
- Other (Please specify in the next section)
validations:
required: false
- type: textarea
id: command
attributes:
label: Command line
description: >
Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
This will be automatically formatted into code, so no need for backticks.
render: shell
validations:
required: false
- type: textarea
id: info
attributes:
label: Problem description & steps to reproduce
description: >
Please give us a summary of the problem and tell us how to reproduce it (if applicable).
validations:
required: true
- type: textarea
id: first_bad_commit
attributes:
label: First Bad Commit
description: >
If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
If possible, please do a git bisect and identify the exact commit that introduced the bug.
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: >
If applicable, please copy and paste any relevant log output, including any generated text.
This will be automatically formatted into code, so no need for backticks.
render: shell
validations:
required: false

View file

@ -1,51 +0,0 @@
name: Enhancement
description: Used to request enhancements for llama.cpp.
title: "Feature Request: "
labels: ["enhancement"]
body:
- type: markdown
attributes:
value: |
[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
- type: checkboxes
id: prerequisites
attributes:
label: Prerequisites
description: Please confirm the following before submitting your enhancement request.
options:
- label: I am running the latest code. Mention the version if possible as well.
required: true
- label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
required: true
- label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
required: true
- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
required: true
- type: textarea
id: feature-description
attributes:
label: Feature Description
description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
placeholder: Detailed description of the enhancement
validations:
required: true
- type: textarea
id: motivation
attributes:
label: Motivation
description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
placeholder: Explanation of why this feature is needed and its benefits
validations:
required: true
- type: textarea
id: possible-implementation
attributes:
label: Possible Implementation
description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
placeholder: Detailed description of potential implementation
validations:
required: false

View file

@ -1,52 +0,0 @@
name: Research
description: Track new technical research area.
title: "Research: "
labels: ["research 🔬"]
body:
- type: markdown
attributes:
value: |
Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
- type: checkboxes
id: research-stage
attributes:
label: Research Stage
description: Track general state of this research ticket
options:
- label: Background Research (Let's try to avoid reinventing the wheel)
- label: Hypothesis Formed (How do you think this will work and it's effect?)
- label: Strategy / Implementation Forming
- label: Analysis of results
- label: Debrief / Documentation (So people in the future can learn from us)
- type: textarea
id: background
attributes:
label: Previous existing literature and research
description: Whats the current state of the art and whats the motivation for this research?
- type: textarea
id: hypothesis
attributes:
label: Hypothesis
description: How do you think this will work and it's effect?
- type: textarea
id: implementation
attributes:
label: Implementation
description: Got an approach? e.g. a PR ready to go?
- type: textarea
id: analysis
attributes:
label: Analysis
description: How does the proposed implementation behave?
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View file

@ -1,28 +0,0 @@
name: Refactor (Maintainers)
description: Used to track refactoring opportunities.
title: "Refactor: "
labels: ["refactor"]
body:
- type: markdown
attributes:
value: |
Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
- type: textarea
id: background-description
attributes:
label: Background Description
description: Please provide a detailed written description of the pain points you are trying to solve.
placeholder: Detailed description behind your motivation to request refactor
validations:
required: true
- type: textarea
id: possible-approaches
attributes:
label: Possible Refactor Approaches
description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
placeholder: Your idea of possible refactoring opportunity/approaches
validations:
required: false

11
.github/ISSUE_TEMPLATE/bug.md vendored Normal file
View file

@ -0,0 +1,11 @@
---
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug-unconfirmed"]
assignees: ''
---
Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).

View file

@ -1,11 +0,0 @@
blank_issues_enabled: true
contact_links:
- name: Got an idea?
url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
about: Pop it there. It may then become an enhancement ticket.
- name: Got a question?
url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
about: Ask a question there!
- name: Want to contribute?
url: https://github.com/ggerganov/llama.cpp/wiki/contribute
about: Head to the contribution guide page of the wiki for areas you can help with

28
.github/ISSUE_TEMPLATE/enhancement.md vendored Normal file
View file

@ -0,0 +1,28 @@
---
name: Enhancement template
about: Used to request enhancements for llama.cpp
labels: ["enhancement"]
assignees: ''
---
# Prerequisites
Please answer the following questions for yourself before submitting an issue.
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
# Feature Description
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
# Motivation
Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
# Possible Implementation
If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.

86
.github/labeler.yml vendored
View file

@ -1,86 +0,0 @@
# https://github.com/actions/labeler
Kompute:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-kompute.h
- ggml/src/ggml-kompute/**
- README-kompute.md
Apple Metal:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-metal.h
- ggml/src/ggml-metal/**
- README-metal.md
SYCL:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-sycl.h
- ggml/src/ggml-sycl/**
- docs/backend/SYCL.md
- examples/sycl/**
Nvidia GPU:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-cuda.h
- ggml/src/ggml-cuda/**
Vulkan:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-vulkan.h
- ggml/src/ggml-vulkan/**
documentation:
- changed-files:
- any-glob-to-any-file:
- docs/**
- media/**
testing:
- changed-files:
- any-glob-to-any-file:
- tests/**
build:
- changed-files:
- any-glob-to-any-file:
- cmake/**
- CMakeLists.txt
- CMakePresets.json
examples:
- changed-files:
- any-glob-to-any-file: examples/**
devops:
- changed-files:
- any-glob-to-any-file:
- .devops/**
- .github/**
- ci/**
python:
- changed-files:
- any-glob-to-any-file:
- "**/*.py"
- requirements/**
- gguf-py/**
- .flake8
script:
- changed-files:
- any-glob-to-any-file:
- scripts/**
android:
- changed-files:
- any-glob-to-any-file:
- examples/llama.android/**
server:
- changed-files:
- any-glob-to-any-file:
- examples/server/**
ggml:
- changed-files:
- any-glob-to-any-file:
- ggml/**
nix:
- changed-files:
- any-glob-to-any-file:
- "**/*.nix"
- .github/workflows/nix-*.yml
- .devops/nix/nixpkgs-instances.nix
embedding:
- changed-files:
- any-glob-to-any-file: examples/embedding/

View file

@ -1 +0,0 @@
*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*

View file

@ -1,315 +0,0 @@
# TODO: there have been some issues with the workflow, so disabling for now
# https://github.com/ggerganov/llama.cpp/issues/7893
#
# Benchmark
name: Benchmark
on:
workflow_dispatch:
inputs:
gpu-series:
description: 'Azure GPU series to run with'
required: true
type: choice
options:
- Standard_NC4as_T4_v3
- Standard_NC24ads_A100_v4
- Standard_NC80adis_H100_v5
sha:
description: 'Commit SHA1 to build'
required: false
type: string
duration:
description: 'Duration of the bench'
type: string
default: 10m
push:
branches:
- master
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
pull_request_target:
types: [opened, synchronize, reopened]
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
schedule:
- cron: '04 2 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
cancel-in-progress: true
jobs:
bench-server-baseline:
runs-on: Standard_NC4as_T4_v3
env:
RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
N_USERS: 8
DURATION: 10m
strategy:
matrix:
model: [phi-2]
ftype: [q4_0, q8_0, f16]
include:
- model: phi-2
ftype: q4_0
pr_comment_enabled: "true"
if: |
inputs.gpu-series == 'Standard_NC4as_T4_v3'
|| (
github.event_name == 'schedule'
&& github.ref_name == 'master'
&& github.repository_owner == 'ggerganov'
)
|| github.event_name == 'pull_request_target'
|| (
github.event_name == 'push'
&& github.event.ref == 'refs/heads/master'
&& github.repository_owner == 'ggerganov'
)
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Install python env
id: pipenv
run: |
cd examples/server/bench
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
- name: Prometheus
id: install_prometheus
run: |
wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
tar xzf prometheus*.tar.gz --strip-components=1
./prometheus --config.file=examples/server/bench/prometheus.yml &
while ! nc -z localhost 9090; do
sleep 0.1
done
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: '1.21'
- name: Install k6 and xk6-sse
id: k6_installation
run: |
cd examples/server/bench
go install go.k6.io/xk6/cmd/xk6@latest
xk6 build master \
--with github.com/phymbert/xk6-sse
- name: Build
id: cmake_build
run: |
set -eux
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DLLAMA_CUBLAS=ON \
-DCUDAToolkit_ROOT=/usr/local/cuda \
-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
-DCMAKE_CUDA_ARCHITECTURES=75 \
-DLLAMA_FATAL_WARNINGS=OFF \
-DLLAMA_ALL_WARNINGS=OFF \
-DCMAKE_BUILD_TYPE=Release;
cmake --build build --config Release -j $(nproc) --target llama-server
- name: Download the dataset
id: download_dataset
run: |
cd examples/server/bench
wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
- name: Server bench
id: server_bench
env:
HEAD_REF: ${{ github.head_ref || github.ref_name }}
run: |
set -eux
cd examples/server/bench
source venv/bin/activate
python bench.py \
--runner-label ${{ env.RUNNER_LABEL }} \
--name ${{ github.job }} \
--branch $HEAD_REF \
--commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
--scenario script.js \
--duration ${{ github.event.inputs.duration || env.DURATION }} \
--hf-repo ggml-org/models \
--hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
--model-path-prefix /models \
--parallel ${{ env.N_USERS }} \
-ngl 33 \
--batch-size 2048 \
--ubatch-size 256 \
--ctx-size 16384 \
--n-prompts 1000 \
--max-prompt-tokens 1024 \
--max-tokens 2048
cat results.github.env >> $GITHUB_ENV
# Remove dataset as we do not want it in the artefact
rm ShareGPT_V3_unfiltered_cleaned_split.json
- uses: actions/upload-artifact@v4
with:
name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
compression-level: 9
path: |
examples/server/bench/*.jpg
examples/server/bench/*.json
examples/server/bench/*.log
- name: Commit status
uses: Sibz/github-status-action@v1
with:
authToken: ${{secrets.GITHUB_TOKEN}}
sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
description: |
${{ env.BENCH_RESULTS }}
state: 'success'
- name: Upload benchmark images
uses: devicons/public-upload-to-imgur@v2.2.2
continue-on-error: true # Important as it looks unstable: 503
id: imgur_step
with:
client_id: ${{secrets.IMGUR_CLIENT_ID}}
path: |
examples/server/bench/prompt_tokens_seconds.jpg
examples/server/bench/predicted_tokens_seconds.jpg
examples/server/bench/kv_cache_usage_ratio.jpg
examples/server/bench/requests_processing.jpg
- name: Extract mermaid
id: set_mermaid
run: |
set -eux
cd examples/server/bench
PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
- name: Extract image url
id: extract_image_url
continue-on-error: true
run: |
set -eux
echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
- name: Comment PR
uses: mshick/add-pr-comment@v2
id: comment_pr
if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
with:
message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
message: |
<p align="center">
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
</p>
<details>
<summary>Expand details for performance related PR only</summary>
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
- ${{ env.BENCH_GRAPH_XLABEL }}
<p align="center">
<img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
<details>
<summary>More</summary>
```mermaid
${{ env.PROMPT_TOKENS_SECONDS }}
```
</details>
<img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
<details>
<summary>More</summary>
```mermaid
${{ env.PREDICTED_TOKENS_SECONDS }}
```
</details>
</p>
<details>
<summary>Details</summary>
<p align="center">
<img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
<details>
<summary>More</summary>
```mermaid
${{ env.KV_CACHE_USAGE_RATIO }}
```
</details>
<img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
<details>
<summary>More</summary>
```mermaid
${{ env.REQUESTS_PROCESSING }}
```
</details>
</p>
</details>
</details>

File diff suppressed because it is too large Load diff

View file

@ -1,28 +0,0 @@
name: Close inactive issues
on:
schedule:
- cron: "42 0 * * *"
# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
issues: write
jobs:
close-issues:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v5
with:
exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
days-before-issue-stale: 30
days-before-issue-close: 14
stale-issue-label: "stale"
close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
operations-per-run: 10000
repo-token: ${{ secrets.GITHUB_TOKEN }}

36
.github/workflows/code-coverage.yml vendored Normal file
View file

@ -0,0 +1,36 @@
name: Code Coverage
on: [push, pull_request]
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
jobs:
run:
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8 lcov
- name: Build
run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
- name: Run tests
run: CC=gcc-8 make test
- name: Generate coverage report
run: |
make coverage
make lcov-report
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
with:
files: lcov-report/coverage.info

View file

@ -10,50 +10,45 @@
name: Publish Docker image name: Publish Docker image
on: on:
workflow_dispatch: # allows manual triggering pull_request:
schedule: push:
# Rebuild daily rather than on every push because it is expensive branches:
- cron: '12 4 * * *' - master
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
packages: write
jobs: jobs:
push_to_registry: push_to_registry:
name: Push Docker image to Docker Hub name: Push Docker image to Docker Hub
if: github.event.pull_request.draft == false
runs-on: ubuntu-22.04 runs-on: ubuntu-latest
env: env:
COMMIT_SHA: ${{ github.sha }} COMMIT_SHA: ${{ github.sha }}
strategy: strategy:
fail-fast: false
matrix: matrix:
config: config:
# Multi-stage build - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false} - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} # have disabled them for now until the reason why
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} # is understood.
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
steps: steps:
- name: Check out the repo - name: Check out the repo
uses: actions/checkout@v4 uses: actions/checkout@v3
with:
fetch-depth: 0 # preserve git history, so we can determine the build number
- name: Set up QEMU - name: Set up QEMU
uses: docker/setup-qemu-action@v3 uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx - name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3 uses: docker/setup-buildx-action@v2
- name: Log in to Docker Hub - name: Log in to Docker Hub
uses: docker/login-action@v2 uses: docker/login-action@v2
@ -62,45 +57,9 @@ jobs:
username: ${{ github.repository_owner }} username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }} password: ${{ secrets.GITHUB_TOKEN }}
- name: Determine tag name # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
# determine tag name postfix (build number, commit hash)
if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
TAG_POSTFIX="-b${BUILD_NUMBER}"
else
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
fi
# list all tags possible
if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
TYPE=""
else
TYPE="-${{ matrix.config.tag }}"
fi
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
echo "full_output_tags=$FULLTAGS" # print out for debugging
echo "light_output_tags=$LIGHTTAGS" # print out for debugging
echo "server_output_tags=$SERVERTAGS" # print out for debugging
env:
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
- name: Free Disk Space (Ubuntu) - name: Free Disk Space (Ubuntu)
if: ${{ matrix.config.free_disk_space == true }} uses: jlumbroso/free-disk-space@main
uses: ggml-org/free-disk-space@v1.3.1
with: with:
# this might remove tools that are actually needed, # this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB # if set to "true" but frees about 6 GB
@ -115,59 +74,34 @@ jobs:
docker-images: true docker-images: true
swap-storage: true swap-storage: true
- name: Build and push Full Docker image (tagged + versioned) - name: Determine tag name
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }} id: tag
uses: docker/build-push-action@v6 shell: bash
with: run: |
context: . BUILD_NUMBER="$(git rev-list --count HEAD)"
push: true SHORT_HASH="$(git rev-parse --short=7 HEAD)"
platforms: ${{ matrix.config.platforms }} if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
# tag list is generated from step above echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
tags: ${{ steps.tag.outputs.full_output_tags }} else
file: ${{ matrix.config.dockerfile }} SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
target: full echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
provenance: false fi
# using github experimental cache
cache-from: type=gha
cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
- name: Build and push Light Docker image (tagged + versioned) - name: Build and push Docker image (versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }} if: github.event_name == 'push'
uses: docker/build-push-action@v6 uses: docker/build-push-action@v4
with: with:
context: . context: .
push: true push: true
platforms: ${{ matrix.config.platforms }} platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
tags: ${{ steps.tag.outputs.light_output_tags }}
file: ${{ matrix.config.dockerfile }} file: ${{ matrix.config.dockerfile }}
target: light
provenance: false
# using github experimental cache
cache-from: type=gha
cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
- name: Build and push Server Docker image (tagged + versioned) - name: Build and push Docker image (tagged)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }} uses: docker/build-push-action@v4
uses: docker/build-push-action@v6
with: with:
context: . context: .
push: true push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platforms }} platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
tags: ${{ steps.tag.outputs.server_output_tags }}
file: ${{ matrix.config.dockerfile }} file: ${{ matrix.config.dockerfile }}
target: server
provenance: false
# using github experimental cache
cache-from: type=gha
cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache

View file

@ -14,16 +14,10 @@ on:
branches: branches:
- master - master
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs: jobs:
editorconfig: editorconfig:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v3
- uses: editorconfig-checker/action-editorconfig-checker@v2 - uses: editorconfig-checker/action-editorconfig-checker@main
with:
version: v3.0.3
- run: editorconfig-checker - run: editorconfig-checker

View file

@ -24,9 +24,9 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v3
- name: Set up Python - name: Set up Python
uses: actions/setup-python@v5 uses: actions/setup-python@v2
with: with:
python-version: '3.9.x' python-version: '3.9.x'
- name: Install dependencies - name: Install dependencies

View file

@ -1,17 +0,0 @@
name: "Pull Request Labeler"
on:
- pull_request_target
jobs:
labeler:
permissions:
contents: read
pull-requests: write
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
repository: "ggerganov/llama.cpp"
- uses: actions/labeler@v5
with:
configuration-path: '.github/labeler.yml'

61
.github/workflows/nix-ci-aarch64.yml vendored Normal file
View file

@ -0,0 +1,61 @@
name: Nix aarch64 builds
on:
workflow_dispatch: # allows manual triggering
schedule:
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
# 1.5h instead of minutes with the cold cache).
#
# randint(0, 59), randint(0, 23)
- cron: '26 12 * * *'
# But also rebuild if we touched any of the Nix expressions:
push:
branches:
- master
paths: ['**/*.nix', 'flake.lock']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/*.nix', 'flake.lock']
jobs:
nix-build-aarch64:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install QEMU
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
run: |
sudo apt-get update
sudo apt-get install -y qemu-user-static qemu-system-aarch64
sudo usermod -a -G kvm $USER
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-platforms = aarch64-linux
extra-system-features = nixos-test kvm
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: llama-cpp
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.aarch64-linux"
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--systems aarch64-linux
--flake
".#checks.aarch64-linux"

68
.github/workflows/nix-ci.yml vendored Normal file
View file

@ -0,0 +1,68 @@
name: Nix CI
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
pull_request:
types: [opened, synchronize, reopened]
jobs:
nix-eval:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: List all flake outputs
run: nix flake show --all-systems
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
nix-build:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: llama-cpp
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--flake
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"

22
.github/workflows/nix-flake-update.yml vendored Normal file
View file

@ -0,0 +1,22 @@
name: update-flake-lock
on:
workflow_dispatch:
schedule:
- cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
jobs:
lockfile:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@main
- name: Update flake.lock
uses: DeterminateSystems/update-flake-lock@main
with:
pr-title: "nix: update flake.lock"
pr-labels: |
nix
pr-reviewers: philiptaron,SomeoneSerge
token: ${{ secrets.FLAKE_TOKEN }}

36
.github/workflows/nix-publish-flake.yml vendored Normal file
View file

@ -0,0 +1,36 @@
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
push:
tags:
- "*"
workflow_dispatch:
inputs:
tag:
description: "The existing tag to publish"
type: "string"
required: true
jobs:
flakestry-publish:
runs-on: ubuntu-latest
permissions:
id-token: "write"
contents: "read"
steps:
- uses: flakestry/flakestry-publish@main
with:
version: "${{ inputs.tag || github.ref_name }}"
flakehub-publish:
runs-on: "ubuntu-latest"
permissions:
id-token: "write"
contents: "read"
steps:
- uses: "actions/checkout@v4"
with:
ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
- uses: "DeterminateSystems/nix-installer-action@main"
- uses: "DeterminateSystems/flakehub-push@main"
with:
visibility: "public"
tag: "${{ inputs.tag }}"

View file

@ -3,20 +3,16 @@ name: Python check requirements.txt
on: on:
push: push:
paths: paths:
- '.github/workflows/python-check-requirements.yml'
- 'scripts/check-requirements.sh' - 'scripts/check-requirements.sh'
- 'convert*.py' - 'convert*.py'
- '**/requirements*.txt' - 'requirements.txt'
- 'requirements/*.txt'
pull_request: pull_request:
paths: paths:
- '.github/workflows/python-check-requirements.yml'
- 'scripts/check-requirements.sh' - 'scripts/check-requirements.sh'
- 'convert*.py' - 'convert*.py'
- '**/requirements*.txt' - 'requirements.txt'
- 'requirements/*.txt'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs: jobs:
python-check-requirements: python-check-requirements:
@ -24,10 +20,10 @@ jobs:
name: check-requirements name: check-requirements
steps: steps:
- name: Check out source repository - name: Check out source repository
uses: actions/checkout@v4 uses: actions/checkout@v3
- name: Set up Python environment - name: Set up Python environment
uses: actions/setup-python@v5 uses: actions/setup-python@v4
with: with:
python-version: "3.11" python-version: "3.11"
- name: Run check-requirements.sh script - name: Run check-requirements.sh script
run: bash scripts/check-requirements.sh run: bash scripts/check-requirements.sh nocleanup

View file

@ -1,17 +1,6 @@
name: flake8 Lint name: flake8 Lint
on: on: [push, pull_request]
push:
branches:
- master
paths: ['.github/workflows/python-lint.yml', '**/*.py']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/python-lint.yml', '**/*.py']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs: jobs:
flake8-lint: flake8-lint:
@ -19,12 +8,13 @@ jobs:
name: Lint name: Lint
steps: steps:
- name: Check out source repository - name: Check out source repository
uses: actions/checkout@v4 uses: actions/checkout@v3
- name: Set up Python environment - name: Set up Python environment
uses: actions/setup-python@v5 uses: actions/setup-python@v4
with: with:
python-version: "3.11" python-version: "3.11"
- name: flake8 Lint - name: flake8 Lint
uses: py-actions/flake8@v2 uses: py-actions/flake8@v2
with: with:
plugins: "flake8-no-print" ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
exclude: "examples/*,examples/*/**,*/**/__init__.py"

View file

@ -1,40 +0,0 @@
name: Python Type-Check
on:
push:
paths:
- '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json'
- '**.py'
- '**/requirements*.txt'
pull_request:
paths:
- '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json'
- '**.py'
- '**/requirements*.txt'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
python-type-check:
runs-on: ubuntu-latest
name: pyright type-check
steps:
- name: Check out source repository
uses: actions/checkout@v4
- name: Set up Python environment
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install Python dependencies
# TODO: use a venv
run: pip install -r requirements/requirements-all.txt
- name: Type-check with Pyright
uses: jakebailey/pyright-action@v2
with:
version: 1.1.382
level: warning
warnings: true

View file

@ -3,32 +3,13 @@ name: Server
on: on:
workflow_dispatch: # allows manual triggering workflow_dispatch: # allows manual triggering
inputs:
sha:
description: 'Commit SHA1 to build'
required: false
type: string
slow_tests:
description: 'Run slow tests'
required: true
type: boolean
push: push:
branches: branches:
- master - master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
pull_request: pull_request:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
env:
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
LLAMA_LOG_VERBOSITY: 10
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs: jobs:
server: server:
@ -36,204 +17,67 @@ jobs:
strategy: strategy:
matrix: matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [RelWithDebInfo] build_type: [Debug, Release]
include: include:
- build_type: Release - build_type: Release
sanitizer: "" sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken exclude:
- build_type: Release
sanitizer: ADDRESS
- build_type: Release
sanitizer: THREAD
- build_type: Release
sanitizer: UNDEFINED
container:
image: ubuntu:latest
ports:
- 8888
options: --cpus 4
steps: steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies - name: Dependencies
id: depends id: depends
run: | run: |
sudo apt-get update apt-get update
sudo apt-get -y install \ apt-get -y install \
build-essential \ build-essential \
xxd \
git \ git \
cmake \ cmake \
curl \ python3-pip \
wget \ wget \
language-pack-en \ psmisc
libcurl4-openssl-dev
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r examples/server/tests/requirements.txt
# Setup nodejs (to be used for verifying bundled index.html)
- uses: actions/setup-node@v4
with:
node-version: '22.11.0'
- name: WebUI - Install dependencies
id: webui_lint
run: |
cd examples/server/webui
npm ci
- name: WebUI - Check code format
id: webui_format
run: |
git config --global --add safe.directory $(realpath .)
cd examples/server/webui
git status
npm run format
git status
modified_files="$(git status -s)"
echo "Modified files: ${modified_files}"
if [ -n "${modified_files}" ]; then
echo "Files do not follow coding style. To fix: npm run format"
echo "${modified_files}"
exit 1
fi
- name: Verify bundled index.html
id: verify_server_index_html
run: |
git config --global --add safe.directory $(realpath .)
cd examples/server/webui
git status
npm run build
git status
modified_files="$(git status -s)"
echo "Modified files: ${modified_files}"
if [ -n "${modified_files}" ]; then
echo "Repository is dirty or server/webui is not built as expected"
echo "Hint: You may need to follow Web UI build guide in server/README.md"
echo "${modified_files}"
exit 1
fi
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
run: |
cd examples/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd examples/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd examples/server/tests
SLOW_TESTS=1 ./tests.sh
server-windows:
runs-on: windows-2019
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: libCURL
id: get_libcurl
env:
CURL_VERSION: 8.6.0_6
run: |
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
mkdir $env:RUNNER_TEMP/libcurl
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" mkdir build
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server cd build
cmake .. \
- name: Python setup -DLLAMA_NATIVE=OFF \
id: setup_python -DLLAMA_BUILD_SERVER=ON \
uses: actions/setup-python@v5 -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
with: -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
python-version: '3.11' cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
- name: Tests dependencies - name: Tests dependencies
id: test_dependencies id: test_dependencies
run: | run: |
pip install -r examples/server/tests/requirements.txt pip install -r examples/server/tests/requirements.txt
- name: Copy Libcurl - name: Download models
id: prepare_libcurl id: download_models
run: | run: |
cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll cd examples/server/tests
../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf
- name: Tests - name: Tests
id: server_integration_tests id: server_integration_test
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
run: | run: |
cd examples/server/tests cd examples/server/tests
$env:PYTHONIOENCODING = ":replace" PORT=8888 ./tests.sh
pytest -v -x -m "not slow"
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd examples/server/tests
$env:SLOW_TESTS = "1"
pytest -v -x

20
.github/workflows/tidy-post.yml vendored Normal file
View file

@ -0,0 +1,20 @@
name: clang-tidy review post comments
on:
workflow_dispatch:
workflows: ["clang-tidy-review"]
types:
- completed
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: ZedThree/clang-tidy-review/post@v0.13.0
# lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
with:
# adjust options as necessary
lgtm_comment_body: ''
annotations: false
max_comments: 25

23
.github/workflows/tidy-review.yml vendored Normal file
View file

@ -0,0 +1,23 @@
name: clang-tidy-review
on:
pull_request:
branches:
- master
jobs:
clang-tidy-review:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: ZedThree/clang-tidy-review@v0.13.0
id: review
with:
lgtm_comment_body: ''
build_dir: build
cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
split_workflow: true
- uses: ZedThree/clang-tidy-review/upload@v0.13.0

25
.github/workflows/zig-build.yml vendored Normal file
View file

@ -0,0 +1,25 @@
name: Zig CI
on:
pull_request:
push:
branches:
- master
jobs:
build:
strategy:
fail-fast: false
matrix:
runs-on: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.runs-on }}
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
fetch-depth: 0
- uses: goto-bus-stop/setup-zig@v2
with:
version: 0.11.0
- name: Build Summary
run: zig build --summary all -freference-trace

179
.gitignore vendored
View file

@ -1,145 +1,94 @@
# Extensions
*.a
*.bat
*.bin
*.d
*.dll
*.dot
*.etag
*.exe
*.gcda
*.gcno
*.gcov
*.gguf
*.gguf.json
*.lastModified
*.log
*.metallib
*.o *.o
*.a
*.so *.so
*.swp *.gguf
*.tmp *.bin
*.exe
# IDE / OS *.dll
*.log
*.gcov
*.gcno
*.gcda
*.dot
*.bat
*.metallib
.DS_Store
.build/
.cache/ .cache/
.ccls-cache/ .ccls-cache/
.direnv/ .direnv/
.DS_Store
.envrc .envrc
.idea/
.swiftpm .swiftpm
.venv
.clang-tidy
.vs/ .vs/
.vscode/ .vscode/
nppBackup .idea/
# Coverage
gcovr-report/
lcov-report/ lcov-report/
gcovr-report/
# Build Artifacts
tags
.build/
build* build*
!build-info.cmake
!build-info.cpp.in
!build-info.sh
!build.zig
!docs/build.md
/libllama.so
/llama-*
/vulkan-shaders-gen
android-ndk-*
arm_neon.h
cmake-build-* cmake-build-*
CMakeSettings.json
compile_commands.json
ggml-metal-embed.metal
llama-batched-swift
/rpc-server
out/ out/
tmp/ tmp/
autogen-*.md
# Deprecated
/main
/server
# CI
!.github/workflows/*.yml
# Models
models/* models/*
models-mnt models-mnt
!models/.editorconfig
!models/ggml-vocab-*.gguf*
# Zig /Pipfile
/baby-llama
/beam-search
/benchmark-matmult
/convert-llama2c-to-ggml
/embd-input-test
/embedding
/gguf
/gguf-llama-simple
/imatrix
/infill
/libllama.so
/llama-bench
/llava-cli
/lookahead
/lookup
/main
/metal
/passkey
/perplexity
/q8dot
/quantize
/quantize-stats
/result
/save-load-state
/server
/simple
/batched
/batched-bench
/export-lora
/finetune
/speculative
/parallel
/train-text-from-scratch
/tokenize
/vdot
/common/build-info.cpp
arm_neon.h
compile_commands.json
CMakeSettings.json
__pycache__
dist
zig-out/ zig-out/
zig-cache/ zig-cache/
# Logs
ppl-*.txt ppl-*.txt
qnt-*.txt qnt-*.txt
perf-*.txt perf-*.txt
# Examples
examples/jeopardy/results.txt examples/jeopardy/results.txt
examples/server/*.css.hpp
examples/server/*.html.hpp
examples/server/*.js.hpp
examples/server/*.mjs.hpp
!build_64.sh
!examples/*.bat
!examples/*/*.kts
!examples/*/*/*.kts
!examples/sycl/*.bat
!examples/sycl/*.sh
# Server Web UI temporary files poetry.lock
node_modules
examples/server/webui/dist
# Python
/.venv
__pycache__/
*/poetry.lock
poetry.toml poetry.toml
nppBackup
# Nix
/result
# Test binaries
/tests/test-backend-ops
/tests/test-double-float
/tests/test-grad0
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-opt
/tests/test-quantize-fns
/tests/test-quantize-perf
/tests/test-rope
/tests/test-sampling
/tests/test-tokenizer-0
/tests/test-tokenizer-1-bpe
/tests/test-tokenizer-1-spm
# Scripts
!/scripts/install-oneapi.bat
# Test models for lora adapters
/lora-tests
# Local scripts
/run-vim.sh
/run-chat.sh

2
.gitmodules vendored
View file

@ -1,3 +1,3 @@
[submodule "kompute"] [submodule "kompute"]
path = ggml/src/ggml-kompute/kompute path = kompute
url = https://github.com/nomic-ai/kompute.git url = https://github.com/nomic-ai/kompute.git

View file

@ -3,14 +3,13 @@
exclude: prompts/.*.txt exclude: prompts/.*.txt
repos: repos:
- repo: https://github.com/pre-commit/pre-commit-hooks - repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0 rev: v3.2.0
hooks: hooks:
- id: trailing-whitespace - id: trailing-whitespace
- id: end-of-file-fixer - id: end-of-file-fixer
- id: check-yaml - id: check-yaml
- id: check-added-large-files - id: check-added-large-files
- repo: https://github.com/PyCQA/flake8 - repo: https://github.com/PyCQA/flake8
rev: 7.0.0 rev: 6.0.0
hooks: hooks:
- id: flake8 - id: flake8
additional_dependencies: [flake8-no-print]

1047
AUTHORS

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,97 +0,0 @@
{
"version": 4,
"configurePresets": [
{
"name": "base",
"hidden": true,
"generator": "Ninja",
"binaryDir": "${sourceDir}/build-${presetName}",
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
},
{
"name": "sycl-base",
"hidden": true,
"generator": "Ninja",
"binaryDir": "${sourceDir}/build-${presetName}",
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_CXX_COMPILER": "icx",
"CMAKE_C_COMPILER": "cl",
"GGML_SYCL": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
},
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
{ "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },
{
"name": "x64-windows-llvm", "hidden": true,
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
}
},
{
"name": "arm64-windows-msvc", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
}
},
{
"name": "arm64-windows-llvm", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
}
},
{
"name": "arm64-apple-clang", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
}
},
{ "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
{ "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
{ "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
{ "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
{ "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
{ "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
{ "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
{ "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
{ "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
{ "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
{ "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
]
}

View file

@ -1,11 +0,0 @@
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
/ci/ @ggerganov
/.devops/*.Dockerfile @ngxson
/examples/server/ @ngxson
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler

View file

@ -1,125 +0,0 @@
# Pull requests (for contributors)
- Test your changes:
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
# Pull requests (for collaborators)
- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
# Coding guidelines
- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Use sized integer types such as `int32_t` in the public API, e.g. `size_t` may also be appropriate for allocation sizes or byte offsets
- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo`
- In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary
```cpp
// OK
llama_context * ctx;
const llama_rope_type rope_type;
// not OK
struct llama_context * ctx;
const enum llama_rope_type rope_type;
```
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
![matmul](media/matmul.png)
# Naming guidelines
- Use `snake_case` for function, variable and type names
- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
```cpp
// not OK
int small_number;
int big_number;
// OK
int number_small;
int number_big;
```
- Enum values are always in upper case and prefixed with the enum name
```cpp
enum llama_vocab_type {
LLAMA_VOCAB_TYPE_NONE = 0,
LLAMA_VOCAB_TYPE_SPM = 1,
LLAMA_VOCAB_TYPE_BPE = 2,
LLAMA_VOCAB_TYPE_WPM = 3,
LLAMA_VOCAB_TYPE_UGM = 4,
LLAMA_VOCAB_TYPE_RWKV = 5,
};
```
- The general naming pattern is `<class>_<method>`, with `<method>` being `<action>_<noun>`
```cpp
llama_model_init(); // class: "llama_model", method: "init"
llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove"
llama_sampler_get_seed(); // class: "llama_sampler", method: "get_seed"
llama_set_embeddings(); // class: "llama_context", method: "set_embeddings"
llama_n_threads(); // class: "llama_context", method: "n_threads"
llama_adapter_lora_free(); // class: "llama_adapter_lora", method: "free"
```
- The `get` `<action>` can be omitted
- The `<noun>` can be omitted if not necessary
- The `_context` suffix of the `<class>` is optional. Use it to disambiguate symbols when needed
- Use `init`/`free` for constructor/destructor `<action>`
- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else
```cpp
typedef struct llama_context * llama_context_t;
enum llama_pooling_type llama_pooling_type(const llama_context_t ctx);
```
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_
- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension
- Python filenames are all lowercase with underscores
- _(TODO: abbreviations usage)_
# Preprocessor directives
- _(TODO: add guidelines with examples and apply them to the codebase)_
```cpp
#ifdef FOO
#endif // FOO
```
# Documentation
- Documentation is a community effort
- When you need to look into the source code to figure out how to use an API consider adding a short summary to the header file for future reference
- When you notice incorrect or outdated documentation, please update it
# Resources
The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
https://github.com/ggerganov/llama.cpp/projects

View file

@ -1,6 +1,6 @@
MIT License MIT License
Copyright (c) 2023-2024 The ggml authors Copyright (c) 2023 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

1361
Makefile

File diff suppressed because it is too large Load diff

View file

@ -14,6 +14,47 @@ let package = Package(
.library(name: "llama", targets: ["llama"]), .library(name: "llama", targets: ["llama"]),
], ],
targets: [ targets: [
.systemLibrary(name: "llama", pkgConfig: "llama"), .target(
name: "llama",
path: ".",
exclude: [
"cmake",
"examples",
"scripts",
"models",
"tests",
"CMakeLists.txt",
"ggml-cuda.cu",
"ggml-cuda.h",
"Makefile"
],
sources: [
"ggml.c",
"llama.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"ggml-metal.m",
],
resources: [
.process("ggml-metal.metal")
],
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.define("GGML_USE_ACCELERATE"),
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_USE_METAL"),
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
],
linkerSettings: [
.linkedFramework("Accelerate")
] ]
)
],
cxxLanguageStandard: .cxx11
) )

494
README-sycl.md Normal file
View file

@ -0,0 +1,494 @@
# llama.cpp for SYCL
- [Background](#background)
- [OS](#os)
- [Intel GPU](#intel-gpu)
- [Docker](#docker)
- [Linux](#linux)
- [Windows](#windows)
- [Environment Variable](#environment-variable)
- [Known Issue](#known-issue)
- [Q&A](#q&a)
- [Todo](#todo)
## Background
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
The llama.cpp for SYCL is used to support Intel GPUs.
For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
## OS
|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
|Windows|Support|Windows 11|
## Intel GPU
### Verified
|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770, 730M|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
### Memory
The memory is a limitation to run LLM on GPUs.
When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors: buffer size = 3577.56 MiB`.
For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+.
For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
## Docker
Note:
- Only docker on Linux is tested. Docker on WSL may not work.
- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that)
### Build the image
You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
```sh
# For F16:
#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
# Or, for F32:
docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
```
### Run
```sh
# Firstly, find all the DRI cards:
ls -la /dev/dri
# Then, pick the card that you want to use.
# For example with "/dev/dri/card1"
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
## Linux
### Setup Environment
1. Install Intel GPU driver.
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
Note: for iGPU, please install the client GPU driver.
b. Add user to group: video, render.
```sh
sudo usermod -aG render username
sudo usermod -aG video username
```
Note: re-login to enable it.
c. Check
```sh
sudo apt install clinfo
sudo clinfo -l
```
Output (example):
```
Platform #0: Intel(R) OpenCL Graphics
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
Platform #0: Intel(R) OpenCL HD Graphics
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```
2. Install Intel® oneAPI Base toolkit.
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
Recommend to install to default folder: **/opt/intel/oneapi**.
Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
b. Check
```sh
source /opt/intel/oneapi/setvars.sh
sycl-ls
```
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```
2. Build locally:
Note:
- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
```sh
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
# For FP16:
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Or, for FP32:
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Build example/main only
#cmake --build . --config Release --target main
# Or, build all binary
cmake --build . --config Release -v
cd ..
```
or
```sh
./examples/sycl/build.sh
```
### Run
1. Put model file to folder **models**
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
2. Enable oneAPI running environment
```
source /opt/intel/oneapi/setvars.sh
```
3. List device ID
Run without parameter:
```sh
./build/bin/ls-sycl-device
# or running the "main" executable and look at the output log:
./build/bin/main
```
Check the ID in startup log, like:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
4. Set device ID and execute llama.cpp
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
```sh
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
or run by script:
```sh
./examples/sycl/run_llama2.sh
```
Note:
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
5. Check the device ID in output
Like:
```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```
## Windows
### Setup Environment
1. Install Intel GPU driver.
Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
Note: **The driver is mandatory for compute function**.
2. Install Visual Studio.
Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact oneAPI environment enabling in Windows.
3. Install Intel® oneAPI Base toolkit.
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.
Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.
b. Enable oneAPI running environment:
- In Search, input 'oneAPI'.
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
- In Run:
In CMD:
```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```
c. Check GPU
In oneAPI command line:
```
sycl-ls
```
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
```
4. Install cmake & make
a. Download & install cmake for Windows: https://cmake.org/download/
b. Download & install mingw-w64 make for Windows provided by w64devkit
- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
- Extract `w64devkit` on your pc.
- Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`.
### Build locally:
In oneAPI command line window:
```
mkdir -p build
cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
:: for FP16
:: faster for long-prompt inference
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
:: for FP32
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
:: build example/main only
:: make main
:: build all binary
make -j
cd ..
```
or
```
.\examples\sycl\win-build-sycl.bat
```
Note:
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
### Run
1. Put model file to folder **models**
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
2. Enable oneAPI running environment
- In Search, input 'oneAPI'.
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
- In Run:
In CMD:
```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```
3. List device ID
Run without parameter:
```
build\bin\ls-sycl-device.exe
or
build\bin\main.exe
```
Check the ID in startup log, like:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
4. Set device ID and execute llama.cpp
Set device ID = 0 by **set GGML_SYCL_DEVICE=0**
```
set GGML_SYCL_DEVICE=0
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
```
or run by script:
```
.\examples\sycl\win-run-llama2.bat
```
Note:
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
5. Check the device ID in output
Like:
```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```
## Environment Variable
#### Build
|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path|
#### Running
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
## Known Issue
- Hang during startup
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
Solution: add **--no-mmap** or **--mmap 0**.
## Q&A
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
Miss to enable oneAPI running environment.
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
- In Windows, no result, not error.
Miss to enable oneAPI running environment.
- Meet compile error.
Remove folder **build** and try again.
- I can **not** see **[ext_oneapi_level_zero:gpu:0]** afer install GPU driver in Linux.
Please run **sudo sycl-ls**.
If you see it in result, please add video/render group to your ID:
```
sudo usermod -aG render username
sudo usermod -aG video username
```
Then **relogin**.
If you do not see it, please check the installation GPU steps again.
## Todo
- Support multiple cards.

1185
README.md

File diff suppressed because it is too large Load diff

View file

@ -1,67 +0,0 @@
# Security Policy
- [**Using llama.cpp securely**](#using-llamacpp-securely)
- [Untrusted models](#untrusted-models)
- [Untrusted inputs](#untrusted-inputs)
- [Data privacy](#data-privacy)
- [Untrusted environments or networks](#untrusted-environments-or-networks)
- [Multi-Tenant environments](#multi-tenant-environments)
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
## Using llama.cpp securely
### Untrusted models
Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.
*Always execute untrusted models within a secure, isolated environment such as a sandbox* (e.g., containers, virtual machines). This helps protect your system from potentially malicious code.
> [!NOTE]
> The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.
### Untrusted inputs
Some models accept various input formats (text, images, audio, etc.). The libraries converting these inputs have varying security levels, so it's crucial to isolate the model and carefully pre-process inputs to mitigate script injection risks.
For maximum security when handling untrusted inputs, you may need to employ the following:
* Sandboxing: Isolate the environment where the inference happens.
* Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics.
* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches.
* Input Sanitation: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as:
* Validation: Enforce strict rules on allowed characters and data types.
* Filtering: Remove potentially malicious scripts or code fragments.
* Encoding: Convert special characters into safe representations.
* Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)).
### Data privacy
To protect sensitive data from potential leaks or unauthorized access, it is crucial to sandbox the model execution. This means running the model in a secure, isolated environment, which helps mitigate many attack vectors.
### Untrusted environments or networks
If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
* Encrypt your data if sending it over the network.
### Multi-Tenant environments
If you intend to run multiple models in parallel with shared memory, it is your responsibility to ensure the models do not interact or access each other's data. The primary areas of concern are tenant isolation, resource allocation, model sharing and hardware attacks.
1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.
2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
## Reporting a vulnerability
Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
<!-- normal version -->
However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.

View file

@ -1,4 +0,0 @@
#pragma once
#include <llama.h>

View file

@ -1,5 +0,0 @@
module llama [system] {
header "llama.h"
link "llama"
export *
}

116
awq-py/README.md Normal file
View file

@ -0,0 +1,116 @@
# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp
[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]
**Supported models:**
- [X] LLaMA
- [x] LLaMA 2
- [X] MPT
- [X] Mistral AI v0.1
- [ ] Bloom
- [ ] Mixtral MoE
**TODO:**
- [x] Update version work with both MPT and MPT-AWQ model
- [ ] Add OPT model
- [ ] Add Bloom model
- [ ] Add Mixtral MoE
- [ ] Support w3, w2
## Contents
- [Install](##Install)
- [Convert](##Convert)
- [Quantize](##Quantize)
- [Test](##Test)
- [Benchmark](##Benchmark)
- [Results](##Results)
## Install
Install requirements
```bash
pip install -r requirements.txt
```
Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT
```bash
git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
```
## Convert
Example for llama model
```bash
# For llama7b and llama2 models
python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
# For mistral and mpt models
python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
```
## Quantize
```bash
# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types.
./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
```
## Test
```bash
# For all models.
./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time"
```
## Benchmark
The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
```bash
# For llama and llama2, and mistral models.
./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
```
## Results
Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k
### Llama 7B (Build with OpenBLAS)
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|-----------:|--------------|-------:|-------:|-------:|-------:|
|Llama 7B | perplexity | 5.9066 | 6.1214 | 6.0643 | 6.5808 |
|Llama 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G |
|Llama 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-LLama 7B| perplexity | 5.9175 | 6.0252 | 5.9987 | 6.3692 |
|AWQ-LLama 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G |
|AWQ-LLama 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
### Llama2 7B (Build with CuBLAS)
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|------------:|--------------|-------:|-------:|-------:|-------:|
|Llama2 7B | perplexity | 5.8664 | 6.0260 | 6.0656 | 6.4496 |
|Llama2 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G |
|Llama2 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-LLama2 7B| perplexity | 5.8801 | 6.0054 | 5.9849 | 6.3650 |
|AWQ-LLama2 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G |
|AWQ-LLama2 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
### Mistral 7B v0.1 (Build with CuBLAS)
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|-------------:|--------------|-------:|-------:|-------:|-------:|
|Mistral 7B | perplexity | 5.6931 | 5.8202 | 5.8268 | 6.1645 |
|Mistral 7B | file size | 14.5G | 4.1G | 4.5G | 3.1G |
|Mistral 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-Mistral 7B| perplexity | 5.6934 | 5.8020 | 5.7691 | 6.0426 |
|AWQ-Mistral 7B| file size | 14.5G | 4.1G | 4.5G | 3.1G |
|AWQ-Mistral 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
### MPT 7B (Build with OpenBLAS)
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|---------:|--------------|-------:|-------:|-------:|--------:|
|MPT 7B | perplexity | 8.4369 | 8.7956 | 8.6265 | 11.4913 |
|MPT 7B | file size | 13.7G | 3.9G | 4.3G | 2.8G |
|MPT 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|AWQ-MPT 7B| perplexity | 8.4944 | 8.7053 | 8.6750 | 10.2873|
|AWQ-MPT 7B| file size | 13.7G | 3.9G | 4.3G | 2.8G |
|AWQ-MPT 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |

254
awq-py/awq/apply_awq.py Normal file
View file

@ -0,0 +1,254 @@
"""
Implements the AWQ for llama.cpp use cases.
Original paper: https://arxiv.org/abs/2306.00978
This code is based on versions of the AWQ implementation found in the following repositories:
* https://github.com/mit-han-lab/llm-awq
* https://github.com/casper-hansen/AutoAWQ
"""
import os
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoConfig
from transformers.models.bloom.modeling_bloom import BloomGelu
from transformers.models.llama.modeling_llama import LlamaRMSNorm
from transformers.activations import GELUActivation
class ScaledActivation(nn.Module):
"""
ScaledActivation module wraps an existing activation function and applies a
scale factor to its output.
Args:
module (nn.Module): The activation function to be scaled.
scales (torch.Tensor): A tensor of size (num_features,) containing the initial
scale factors for each feature.
Returns:
torch.Tensor: The scaled output of the activation function.
"""
def __init__(self, module, scales):
super().__init__()
self.act = module
self.scales = nn.Parameter(scales.data)
def forward(self, x):
return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
def set_op_by_name(layer, name, new_module):
"""
Set the new module for given module's name.
Args:
layer (nn.Module): The layer in which to replace the submodule.
name (str): The path to the submodule to be replaced, using dot notation
to access nested modules.
new_module (nn.Module): The new module to replace the existing one.
"""
levels = name.split(".")
if len(levels) > 1:
mod_ = layer
for l_idx in range(len(levels) - 1):
if levels[l_idx].isdigit():
mod_ = mod_[int(levels[l_idx])]
else:
mod_ = getattr(mod_, levels[l_idx])
setattr(mod_, levels[-1], new_module)
else:
setattr(layer, name, new_module)
def get_op_by_name(module, op_name):
"""
Retrieves a submodule within a given layer based on its name.
Args:
module (nn.Module): The layer containing the submodule to find.
op_name (str): The name of the submodule.
Returns:
nn.Module: The requested submodule found within the given layer.
Raises:
ValueError: If the specified submodule cannot be found within the layer.
"""
for name, m in module.named_modules():
if name == op_name:
return m
raise ValueError(f"Cannot find op {op_name} in module {module}")
@torch.no_grad()
def scale_ln_fcs(ln, fcs, scales):
"""
Scales the weights of a LayerNorm and a list of fully-connected layers proportionally.
Args:
ln (nn.LayerNorm): The LayerNorm module to be scaled.
fcs (List[nn.Linear]): A list of fully-connected layers to be scaled.
scales (torch.Tensor): A 1D tensor of size (num_features,).
"""
if not isinstance(fcs, list):
fcs = [fcs]
scales = scales.to(ln.weight.device)
ln.weight.div_(scales)
if hasattr(ln, "bias") and ln.bias is not None:
ln.bias.div_(scales)
for fc in fcs:
fc.weight.mul_(scales.view(1, -1))
for p in ln.parameters():
assert torch.isnan(p).sum() == 0
for fc in fcs:
for p in fc.parameters():
assert torch.isnan(p).sum() == 0
@torch.no_grad()
def scale_fc_fc(fc1, fc2, scales):
"""
Scales the weights of two fully-connected layers in a specific pattern.
Args:
fc1 (nn.Linear): The first fully-connected layer to be scaled.
fc2 (nn.Linear): The second fully-connected layer to be scaled.
scales (torch.Tensor): A 1D tensor of size (num_features,).
"""
assert isinstance(fc1, nn.Linear)
assert isinstance(fc2, nn.Linear)
scales = scales.to(fc1.weight.device)
fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
if fc1.bias is not None:
fc1.bias.div_(scales.view(-1))
fc2.weight.mul_(scales.view(1, -1))
for p in fc1.parameters():
assert torch.isnan(p).sum() == 0
for p in fc2.parameters():
assert torch.isnan(p).sum() == 0
@torch.no_grad()
def scale_gelu_fc(gelu, fc, scales):
"""
Scales the weight of a GELU activation and a fully-connected layer proportionally.
Args:
gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled.
fc (nn.Linear): The fully-connected layer to be scaled.
scales (torch.Tensor): A 1D tensor of size (num_features,).
Raises:
TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`.
TypeError: If the `fc` module is not of type `nn.Linear`.
"""
assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
assert isinstance(fc, nn.Linear)
fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))
for p in fc.parameters():
assert torch.isnan(p).sum() == 0
def apply_scale(module, scales_list, input_feat_dict=None):
"""
Applies different scaling strategies to layers based on their type and hierarchy within a given module.
Args:
module (nn.Module): The module containing the layers to be scaled.
scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing:
* prev_op_name (str): The name of the preceding operation or module,
relative to which the layers to be scaled are located.
* layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation.
* scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding
input features (optional).
"""
for prev_op_name, layer_names, scales in scales_list:
prev_op = get_op_by_name(module, prev_op_name)
layers = [get_op_by_name(module, name) for name in layer_names]
prev_op.cuda()
for layer in layers:
layer.cuda()
scales.cuda()
if isinstance(prev_op, nn.Linear):
assert len(layers) == 1
scale_fc_fc(prev_op, layers[0], scales)
elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower():
scale_ln_fcs(prev_op, layers, scales)
elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
new_module = ScaledActivation(prev_op, scales)
set_op_by_name(module, prev_op_name, new_module)
scale_gelu_fc(prev_op, layers[0], scales)
else:
raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")
# apply the scaling to input feat if given; prepare it for clipping
if input_feat_dict is not None:
for layer_name in layer_names:
inp = input_feat_dict[layer_name]
inp.div_(scales.view(1, -1).to(inp.device))
prev_op.cpu()
for layer in layers:
layer.cpu()
scales.cpu()
@torch.no_grad()
def apply_clip(module, clip_list):
"""
Applies element-wise clipping to the weight of a specific layer within a given module.
Args:
module (nn.Module): The module containing the layer to be clipped.
clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing:
* name (str): The name of the layer to be clipped, relative to the root of the module.
* max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight.
"""
for name, max_val in clip_list:
layer = get_op_by_name(module, name)
layer.cuda()
max_val = max_val.to(layer.weight.device)
org_shape = layer.weight.shape
layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
layer.weight.data = layer.weight.data.reshape(org_shape)
layer.cpu()
def add_scale_weights(model_path, scale_path, tmp_path):
"""
Adds pre-computed Activation Weight Quantization (AWQ) results to a model,
including scaling factors and clipping bounds.
Args:
model_path (str): Path to the pre-trained model to be equipped with AWQ.
scale_path (str): Path to the AWQ scale factors (.pt file).
tmp_path (str): Path to the temporary directory where the equipped model will be saved.
"""
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_path, config=config, trust_remote_code=True
)
model.eval()
awq_results = torch.load(str(scale_path), map_location="cpu")
apply_scale(model, awq_results["scale"])
apply_clip(model, awq_results["clip"])
model.save_pretrained(str(tmp_path))
os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")

2
awq-py/requirements.txt Normal file
View file

@ -0,0 +1,2 @@
torch>=2.1.1
transformers>=4.32.0

139
build.zig Normal file
View file

@ -0,0 +1,139 @@
// Compatible with Zig Version 0.11.0
const std = @import("std");
const ArrayList = std.ArrayList;
const Compile = std.Build.Step.Compile;
const ConfigHeader = std.Build.Step.ConfigHeader;
const Mode = std.builtin.Mode;
const CrossTarget = std.zig.CrossTarget;
const Maker = struct {
builder: *std.build.Builder,
target: CrossTarget,
optimize: Mode,
enable_lto: bool,
include_dirs: ArrayList([]const u8),
cflags: ArrayList([]const u8),
cxxflags: ArrayList([]const u8),
objs: ArrayList(*Compile),
fn addInclude(m: *Maker, dir: []const u8) !void {
try m.include_dirs.append(dir);
}
fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
}
fn addCFlag(m: *Maker, flag: []const u8) !void {
try m.cflags.append(flag);
}
fn addCxxFlag(m: *Maker, flag: []const u8) !void {
try m.cxxflags.append(flag);
}
fn addFlag(m: *Maker, flag: []const u8) !void {
try m.addCFlag(flag);
try m.addCxxFlag(flag);
}
fn init(builder: *std.build.Builder) !Maker {
const target = builder.standardTargetOptions(.{});
const zig_version = @import("builtin").zig_version_string;
const commit_hash = try std.ChildProcess.exec(
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
);
try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
\\int LLAMA_BUILD_NUMBER = {};
\\char const *LLAMA_COMMIT = "{s}";
\\char const *LLAMA_COMPILER = "Zig {s}";
\\char const *LLAMA_BUILD_TARGET = "{s}";
\\
, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
var m = Maker{
.builder = builder,
.target = target,
.optimize = builder.standardOptimizeOption(.{}),
.enable_lto = false,
.include_dirs = ArrayList([]const u8).init(builder.allocator),
.cflags = ArrayList([]const u8).init(builder.allocator),
.cxxflags = ArrayList([]const u8).init(builder.allocator),
.objs = ArrayList(*Compile).init(builder.allocator),
};
try m.addCFlag("-std=c11");
try m.addCxxFlag("-std=c++11");
try m.addProjectInclude(&.{});
try m.addProjectInclude(&.{"common"});
return m;
}
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
if (o.target.getAbi() != .msvc)
o.defineCMacro("_GNU_SOURCE", null);
if (std.mem.endsWith(u8, src, ".c")) {
o.addCSourceFiles(&.{src}, m.cflags.items);
o.linkLibC();
} else {
o.addCSourceFiles(&.{src}, m.cxxflags.items);
if (o.target.getAbi() == .msvc) {
o.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
o.linkLibCpp();
}
}
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
o.want_lto = m.enable_lto;
return o;
}
fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
e.addCSourceFiles(&.{src}, m.cxxflags.items);
for (deps) |d| e.addObject(d);
for (m.objs.items) |o| e.addObject(o);
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
// https://github.com/ziglang/zig/issues/15448
if (e.target.getAbi() == .msvc) {
e.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
e.linkLibCpp();
}
m.builder.installArtifact(e);
e.want_lto = m.enable_lto;
return e;
}
};
pub fn build(b: *std.build.Builder) !void {
var make = try Maker.init(b);
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
const ggml = make.obj("ggml", "ggml.c");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
const llama = make.obj("llama", "llama.cpp");
const buildinfo = make.obj("common", "common/build-info.cpp");
const common = make.obj("common", "common/common.cpp");
const console = make.obj("console", "common/console.cpp");
const sampling = make.obj("sampling", "common/sampling.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
const train = make.obj("train", "common/train.cpp");
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
}

619
ci/run.sh
View file

@ -1,4 +1,4 @@
#!/bin/bash #/bin/bash
# #
# sample usage: # sample usage:
# #
@ -13,9 +13,6 @@
# # with SYCL support # # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# #
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>" echo "usage: $0 <output-dir> <mnt-dir>"
@ -39,25 +36,20 @@ SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON" CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON" CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
fi fi
if [ ! -z ${GG_BUILD_CUDA} ]; then if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native" CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
fi fi
if [ ! -z ${GG_BUILD_SYCL} ]; then if [ ! -z ${GG_BUILD_SYCL} ]; then
if [ -z ${ONEAPI_ROOT} ]; then if [ -z ${ONEAPI_ROOT} ]; then
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:" echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
echo "source /opt/intel/oneapi/setvars.sh"
exit 1 exit 1
fi fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
fi
if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi fi
## helpers ## helpers
@ -110,11 +102,8 @@ function gg_run_ctest_debug {
set -e set -e
# Check cmake, make and ctest are installed
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@ -141,11 +130,8 @@ function gg_run_ctest_release {
set -e set -e
# Check cmake, make and ctest are installed
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@ -166,64 +152,13 @@ function gg_sum_ctest_release {
gg_printf '```\n' gg_printf '```\n'
} }
# test_scripts_debug
function gg_run_test_scripts_debug {
cd ${SRC}
set -e
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}
function gg_sum_test_scripts_debug {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test scripts in debug mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
gg_printf '```\n'
gg_printf '\n'
}
# test_scripts_release
function gg_run_test_scripts_release {
cd ${SRC}
set -e
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}
function gg_sum_test_scripts_release {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test scripts in release mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
gg_printf '```\n'
gg_printf '\n'
}
function gg_get_model { function gg_get_model {
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf" local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf" local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf" if [[ -s $gguf_3b ]]; then
if [[ -s $gguf_0 ]]; then echo -n "$gguf_3b"
echo -n "$gguf_0" elif [[ -s $gguf_7b ]]; then
elif [[ -s $gguf_1 ]]; then echo -n "$gguf_7b"
echo -n "$gguf_1"
elif [[ -s $gguf_2 ]]; then
echo -n "$gguf_2"
else else
echo >&2 "No model found. Can't run gg_run_ctest_with_model." echo >&2 "No model found. Can't run gg_run_ctest_with_model."
exit 1 exit 1
@ -272,34 +207,33 @@ function gg_sum_ctest_with_model_release {
gg_printf '```\n' gg_printf '```\n'
} }
# open_llama_7b_v2 # open_llama_3b_v2
function gg_run_open_llama_7b_v2 { function gg_run_open_llama_3b_v2 {
cd ${SRC} cd ${SRC}
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
path_models="../models-mnt/open-llama/7B-v2" path_models="../models-mnt/open-llama/3B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw" path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -313,49 +247,46 @@ function gg_run_open_llama_7b_v2 {
model_q5_k="${path_models}/ggml-model-q5_k.gguf" model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf" model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test="${path_wiki}/wiki.test.raw" wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 ./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 ./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 ./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 ./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 ./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k ./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k ./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k ./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k ./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl { function check_ppl {
qnt="$1" qnt="$1"
@ -384,148 +315,58 @@ function gg_run_open_llama_7b_v2 {
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
set +e # lora
} function compare_ppl {
function gg_sum_open_llama_7b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 7B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
# pythia_1.4b
function gg_run_pythia_1_4b {
cd ${SRC}
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
path_models="../models-mnt/pythia/1.4B"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1" qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl" printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20 return 20
fi fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl" printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0 return 0
} }
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log path_lora="../models-mnt/open-llama/3B-v2/lora"
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log path_shakespeare="../models-mnt/shakespeare"
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e set +e
} }
function gg_sum_pythia_1_4b { function gg_sum_open_llama_3b_v2 {
gg_printf '### %s\n\n' "${ci}" gg_printf '### %s\n\n' "${ci}"
gg_printf 'Pythia 1.4B:\n' gg_printf 'OpenLLaMA 3B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@ -538,33 +379,42 @@ function gg_sum_pythia_1_4b {
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
} }
# pythia_2_8b # open_llama_7b_v2
# requires: GG_BUILD_CUDA
function gg_run_pythia_2_8b { function gg_run_open_llama_7b_v2 {
cd ${SRC} cd ${SRC}
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/pythia/2.8B" path_models="../models-mnt/open-llama/7B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw" path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -580,47 +430,44 @@ function gg_run_pythia_2_8b {
wiki_test="${path_wiki}/wiki.test.raw" wiki_test="${path_wiki}/wiki.test.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 ./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 ./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 ./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 ./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 ./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k ./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k ./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k ./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k ./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl { function check_ppl {
qnt="$1" qnt="$1"
@ -641,7 +488,7 @@ function gg_run_pythia_2_8b {
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
@ -649,16 +496,59 @@ function gg_run_pythia_2_8b {
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/7B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# currently not supported by the CUDA backend
# q8_0
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e set +e
} }
function gg_sum_pythia_2_8b { function gg_sum_open_llama_7b_v2 {
gg_printf '### %s\n\n' "${ci}" gg_printf '### %s\n\n' "${ci}"
gg_printf 'Pythia 2.8B:\n' gg_printf 'OpenLLaMA 7B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@ -671,6 +561,11 @@ function gg_sum_pythia_2_8b {
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
} }
# bge-small # bge-small
@ -679,7 +574,7 @@ function gg_run_embd_bge_small {
cd ${SRC} cd ${SRC}
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
@ -697,17 +592,17 @@ function gg_run_embd_bge_small {
set -e set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf python3 ../convert-hf-to-gguf.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 ./bin/quantize ${model_f16} ${model_q8_0} q8_0
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
set +e set +e
} }
@ -721,92 +616,8 @@ function gg_sum_embd_bge_small {
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
} }
# rerank_tiny
function gg_run_rerank_tiny {
cd ${SRC}
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json
path_models="../models-mnt/rerank-tiny"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
# for this model, the SEP token is "</s>"
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
# sample output
# rerank score 0: 0.029
# rerank score 1: 0.029
# rerank score 2: 0.135
# check that the score is in the range [$3, $4]
function check_score {
qnt="$1"
score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$score"
return 0
}
check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log
set +e
}
function gg_sum_rerank_tiny {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Rerank Tiny (Jina):\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
}
function gg_check_build_requirements {
if ! command -v cmake &> /dev/null; then
gg_printf 'cmake not found, please install'
fi
if ! command -v make &> /dev/null; then
gg_printf 'make not found, please install'
fi
if ! command -v ctest &> /dev/null; then
gg_printf 'ctest not found, please install'
fi
}
## main ## main
export LLAMA_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
rm -rf ${SRC}/models-mnt rm -rf ${SRC}/models-mnt
@ -815,10 +626,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
ln -sfn ${mnt_models} ${SRC}/models-mnt ln -sfn ${mnt_models} ${SRC}/models-mnt
# Create a fresh python3 venv and enter it # Create a fresh python3 venv and enter it
if ! python3 -m venv "$MNT/venv"; then python3 -m venv "$MNT/venv"
echo "Error: Failed to create Python virtual environment at $MNT/venv."
exit 1
fi
source "$MNT/venv/bin/activate" source "$MNT/venv/bin/activate"
pip install -r ${SRC}/requirements.txt --disable-pip-version-check pip install -r ${SRC}/requirements.txt --disable-pip-version-check
@ -832,19 +640,12 @@ test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small test $ret -eq 0 && gg_run embd_bge_small
test $ret -eq 0 && gg_run rerank_tiny
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
test $ret -eq 0 && gg_run test_scripts_release
fi
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run pythia_1_4b test $ret -eq 0 && gg_run open_llama_3b_v2
else else
test $ret -eq 0 && gg_run pythia_2_8b test $ret -eq 0 && gg_run open_llama_7b_v2
#test $ret -eq 0 && gg_run open_llama_7b_v2
fi fi
test $ret -eq 0 && gg_run ctest_with_model_debug test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release test $ret -eq 0 && gg_run ctest_with_model_release

View file

@ -79,22 +79,22 @@ endmacro()
# flags are for MSVC only! # flags are for MSVC only!
check_sse("AVX" " ;/arch:AVX") check_sse("AVX" " ;/arch:AVX")
if (NOT ${AVX_FOUND}) if (NOT ${AVX_FOUND})
set(GGML_AVX OFF) set(LLAMA_AVX OFF)
else() else()
set(GGML_AVX ON) set(LLAMA_AVX ON)
endif() endif()
check_sse("AVX2" " ;/arch:AVX2") check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2") check_sse("FMA" " ;/arch:AVX2")
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND})) if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
set(GGML_AVX2 OFF) set(LLAMA_AVX2 OFF)
else() else()
set(GGML_AVX2 ON) set(LLAMA_AVX2 ON)
endif() endif()
check_sse("AVX512" " ;/arch:AVX512") check_sse("AVX512" " ;/arch:AVX512")
if (NOT ${AVX512_FOUND}) if (NOT ${AVX512_FOUND})
set(GGML_AVX512 OFF) set(LLAMA_AVX512 OFF)
else() else()
set(GGML_AVX512 ON) set(LLAMA_AVX512 ON)
endif() endif()

View file

@ -1,16 +0,0 @@
set( CMAKE_SYSTEM_NAME Darwin )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-apple-darwin-macho )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

View file

@ -1,16 +0,0 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

View file

@ -1,6 +0,0 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

View file

@ -1,33 +0,0 @@
function(llama_add_compile_flags)
if (LLAMA_FATAL_WARNINGS)
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
list(APPEND C_FLAGS -Werror)
list(APPEND CXX_FLAGS -Werror)
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
add_compile_options(/WX)
endif()
endif()
if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-Werror=implicit-int -Werror=implicit-function-declaration)
list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
list(APPEND C_FLAGS ${WARNING_FLAGS})
list(APPEND CXX_FLAGS ${WARNING_FLAGS})
ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
else()
# todo : msvc
set(C_FLAGS "" PARENT_SCOPE)
set(CXX_FLAGS "" PARENT_SCOPE)
endif()
endif()
endfunction()

View file

@ -1,22 +0,0 @@
find_package(Git)
# the commit's SHA1
execute_process(COMMAND
"${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_SHA1
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
# the date of the commit
execute_process(COMMAND
"${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_DATE
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
# the subject of the commit
execute_process(COMMAND
"${GIT_EXECUTABLE}" log -1 --format=%s
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

View file

@ -1,30 +0,0 @@
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
@PACKAGE_INIT@
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)
find_library(llama_LIBRARY llama
REQUIRED
HINTS ${LLAMA_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH
)
add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}"
INTERFACE_COMPILE_FEATURES c_std_90
POSITION_INDEPENDENT_CODE ON)
check_required_components(Llama)

View file

@ -1,10 +0,0 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=@CMAKE_INSTALL_PREFIX@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
Name: llama
Description: Port of Facebook's LLaMA model in C/C++
Version: @LLAMA_INSTALL_VERSION@
Libs: -L${libdir} -lggml -lggml-base -lllama
Cflags: -I${includedir}

View file

@ -1,11 +0,0 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR x86_64 )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( arch_c_flags "-march=native" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )

14
codecov.yml Normal file
View file

@ -0,0 +1,14 @@
comment: off
coverage:
status:
project:
default:
target: auto
threshold: 0
base: auto
patch:
default:
target: auto
threshold: 0
base: auto

View file

@ -1,8 +1,5 @@
# common # common
find_package(Threads REQUIRED)
llama_add_compile_flags()
# Build info header # Build info header
# #
@ -22,12 +19,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
endif() endif()
endif() endif()
if(EXISTS "${GIT_DIR}/index")
set(GIT_INDEX "${GIT_DIR}/index") set(GIT_INDEX "${GIT_DIR}/index")
else()
message(WARNING "Git index not found in git repository.")
set(GIT_INDEX "")
endif()
else() else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
set(GIT_INDEX "") set(GIT_INDEX "")
@ -39,7 +31,7 @@ add_custom_command(
COMMENT "Generating build details from Git" COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake" -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.." WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX} DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
VERBATIM VERBATIM
@ -50,75 +42,27 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif() endif()
set(TARGET common) set(TARGET common)
add_library(${TARGET} STATIC add_library(${TARGET} STATIC
arg.cpp
arg.h
base64.hpp base64.hpp
chat.cpp
chat.hpp
chat-template.hpp
common.cpp
common.h common.h
console.cpp common.cpp
console.h
json-schema-to-grammar.cpp
json.hpp
llguidance.cpp
log.cpp
log.h
minja.hpp
ngram-cache.cpp
ngram-cache.h
sampling.cpp
sampling.h sampling.h
speculative.cpp sampling.cpp
speculative.h console.h
console.cpp
grammar-parser.h
grammar-parser.cpp
train.h
train.cpp
) )
if (BUILD_SHARED_LIBS) if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif() endif()
set(LLAMA_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url
if (LLAMA_CURL)
find_package(CURL REQUIRED)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
endif ()
if (LLAMA_LLGUIDANCE)
include(ExternalProject)
set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
# v0.6.12:
GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
PREFIX ${CMAKE_BINARY_DIR}/llguidance
SOURCE_DIR ${LLGUIDANCE_SRC}
BUILD_IN_SOURCE TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND cargo build --release
INSTALL_COMMAND ""
BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/libllguidance.a ${LLGUIDANCE_PATH}/llguidance.h
UPDATE_COMMAND ""
)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)
add_library(llguidance STATIC IMPORTED)
set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/libllguidance.a)
add_dependencies(llguidance llguidance_ext)
target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance)
endif ()
target_include_directories(${TARGET} PUBLIC .) target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_17) target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)

Some files were not shown because too many files have changed in this diff Show more