Merge branch 'ggerganov:master' into master

This commit is contained in:
jiahao su 2025-01-16 10:27:59 +08:00 committed by GitHub
commit f77d5ce232
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
640 changed files with 87013 additions and 81204 deletions

161
.clang-format Normal file
View file

@ -0,0 +1,161 @@
---
Language: Cpp
AlignAfterOpenBracket: Align
AlignArrayOfStructures: Left
AlignConsecutiveAssignments: AcrossComments
AlignConsecutiveBitFields: AcrossComments
AlignConsecutiveDeclarations: AcrossComments
AlignConsecutiveMacros: AcrossComments
# AlignConsecutiveShortCaseStatements: AcrossComments
AlignEscapedNewlines: Left # LeftWithLastLine
AlignOperands: Align
AlignTrailingComments:
Kind: Always
OverEmptyLines: 1
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: false
# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: true
BinPackParameters: true # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
AfterCaseLabel: true
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
BeforeLambdaBody: false
BeforeWhile: false
IndentBraces: false
SplitEmptyFunction: false
SplitEmptyRecord: false
SplitEmptyNamespace: false
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never
BreakBeforeBinaryOperators: None
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: false
# BreakBinaryOperations: Never
BreakConstructorInitializers: AfterColon
# BreakFunctionDefinitionParameters: false
BreakInheritanceList: AfterComma
BreakStringLiterals: true
# BreakTemplateDeclarations: Yes
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
EmptyLineBeforeAccessModifier: Leave
EmptyLineAfterAccessModifier: Never
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<.*\.h>'
Priority: 1
SortPriority: 0
- Regex: '^<.*'
Priority: 2
SortPriority: 0
- Regex: '.*'
Priority: 3
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseBlocks: true
IndentCaseLabels: true
IndentExternBlock: NoIndent
IndentGotoLabels: false
IndentPPDirectives: AfterHash
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertBraces: true # NOTE: may lead to incorrect formatting
InsertNewlineAtEOF: true
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
LambdaBodyIndentation: Signature
LineEnding: LF
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PPIndentWidth: -1
PackConstructorInitializers: CurrentLine
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Middle
QualifierAlignment: Left
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
ReferenceAlignment: Middle
ReflowComments: false # IndentOnly
SeparateDefinitionBlocks: Always
SortIncludes: CaseInsensitive
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: true
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInContainerLiterals: true
SpacesInLineCommentPrefix:
Minimum: 1
Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: c++17
TabWidth: 4
UseTab: Never
WhitespaceSensitiveMacros: ['STRINGIZE']
...

View file

@ -17,8 +17,10 @@ Checks: >
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*, performance-*,
portability-*, portability-*,
-portability-simd-intrinsics,
misc-*, misc-*,
-misc-const-correctness, -misc-const-correctness,
-misc-non-private-member-variables-in-classes, -misc-non-private-member-variables-in-classes,
-misc-no-recursion, -misc-no-recursion,
-misc-use-anonymous-namespace,
FormatStyle: none FormatStyle: none

81
.devops/cpu.Dockerfile Normal file
View file

@ -0,0 +1,81 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build -j $(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

94
.devops/cuda.Dockerfile Normal file
View file

@ -0,0 +1,94 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
WORKDIR /app
COPY . .
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View file

@ -1,33 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc) && \
cp build/bin/* .
ENTRYPOINT ["/app/.devops/tools.sh"]

View file

@ -1,26 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc) && \
cp build/bin/* .
ENTRYPOINT ["/app/.devops/tools.sh"]

View file

@ -1,50 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102"
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
RUN make -j$(nproc)
ENTRYPOINT ["/app/.devops/tools.sh"]

View file

@ -1,25 +0,0 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
ENV LLAMA_CURL=1
RUN make -j$(nproc)
ENV LC_ALL=C.utf8
ENTRYPOINT ["/app/.devops/tools.sh"]

91
.devops/intel.Dockerfile Normal file
View file

@ -0,0 +1,91 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
## Build Image
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" \
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
### Full
FROM base AS full
COPY --from=build /app/lib/ /app
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View file

@ -1,6 +1,6 @@
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8 ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
FROM cosdt/cann:$ASCEND_VERSION AS build FROM ascendai/cann:$ASCEND_VERSION AS build
WORKDIR /app WORKDIR /app
@ -22,11 +22,11 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \ RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \ cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --config Release --target llama-cli cmake --build build --config Release --target llama-cli
# TODO: use image with NNRT # TODO: use image with NNRT
FROM cosdt/cann:$ASCEND_VERSION AS runtime FROM ascendai/cann:$ASCEND_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli COPY --from=build /app/build/bin/llama-cli /llama-cli
ENV LC_ALL=C.utf8 ENV LC_ALL=C.utf8

View file

@ -1,37 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git cmake
WORKDIR /app
COPY . .
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc)
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENTRYPOINT [ "/llama-cli" ]

View file

@ -1,28 +0,0 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with static libs" && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --config Release --target llama-cli
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]

View file

@ -1,30 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the MUSA runtime image
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
RUN apt-get update && \
apt-get install -y build-essential git cmake
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc)
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENTRYPOINT [ "/llama-cli" ]

View file

@ -1,45 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102"
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make -j$(nproc) llama-cli
ENTRYPOINT [ "/app/llama-cli" ]

View file

@ -1,27 +0,0 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget libgomp1
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_VULKAN=1 && \
cmake --build build --config Release --target llama-cli
# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-cli /llama-cli && \
rm -rf /app
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]

View file

@ -1,23 +0,0 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
RUN make -j$(nproc) llama-cli
FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]

View file

@ -1,42 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-server -j$(nproc)
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-server /llama-server
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

View file

@ -1,34 +0,0 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-server
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev curl
COPY --from=build /app/build/bin/llama-server /llama-server
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

View file

@ -1,35 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the MUSA runtime image
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-server -j$(nproc)
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-server /llama-server
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

View file

@ -1,54 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102"
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev curl
RUN make -j$(nproc) llama-server
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View file

@ -1,31 +0,0 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build build --config Release --target llama-server
# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-server /llama-server && \
rm -rf /app
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

View file

@ -1,29 +0,0 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev
WORKDIR /app
COPY . .
ENV LLAMA_CURL=1
RUN make -j$(nproc) llama-server
FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/llama-server /llama-server
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

108
.devops/musa.Dockerfile Normal file
View file

@ -0,0 +1,108 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y \
build-essential \
cmake \
python3 \
python3-pip \
git \
libcurl4-openssl-dev \
libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ${BASE_MUSA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View file

@ -31,6 +31,7 @@
# Increases the runtime closure size by ~700M # Increases the runtime closure size by ~700M
useMpi ? false, useMpi ? false,
useRocm ? config.rocmSupport, useRocm ? config.rocmSupport,
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
enableCurl ? true, enableCurl ? true,
useVulkan ? false, useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@ -126,9 +127,9 @@ effectiveStdenv.mkDerivation (finalAttrs: {
}; };
postPatch = '' postPatch = ''
substituteInPlace ./ggml/src/ggml-metal.m \ substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
substituteInPlace ./ggml/src/ggml-metal.m \ substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";" --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
''; '';
@ -173,7 +174,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
(cmakeBool "GGML_NATIVE" false) (cmakeBool "GGML_NATIVE" false)
(cmakeBool "GGML_BLAS" useBlas) (cmakeBool "GGML_BLAS" useBlas)
(cmakeBool "GGML_CUDA" useCuda) (cmakeBool "GGML_CUDA" useCuda)
(cmakeBool "GGML_HIPBLAS" useRocm) (cmakeBool "GGML_HIP" useRocm)
(cmakeBool "GGML_METAL" useMetalKit) (cmakeBool "GGML_METAL" useMetalKit)
(cmakeBool "GGML_VULKAN" useVulkan) (cmakeBool "GGML_VULKAN" useVulkan)
(cmakeBool "GGML_STATIC" enableStatic) (cmakeBool "GGML_STATIC" enableStatic)
@ -188,7 +189,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
] ]
++ optionals useRocm [ ++ optionals useRocm [
(cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang") (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
(cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets)) (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
] ]
++ optionals useMetalKit [ ++ optionals useMetalKit [
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")

View file

@ -34,7 +34,7 @@ let
# server tests # server tests
openai openai
behave pytest
prometheus-client prometheus-client
]; ];
in in

113
.devops/rocm.Dockerfile Normal file
View file

@ -0,0 +1,113 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.3
ARG AMDGPU_VERSION=6.3
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
### Build image
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
# gfx906 is deprecated
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
ARG ROCM_DOCKER_ARCH=gfx1100
# Set nvcc architectured
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
# ENV CC=/opt/rocm/llvm/bin/clang
# ENV CXX=/opt/rocm/llvm/bin/clang++
RUN apt-get update \
&& apt-get install -y \
build-essential \
cmake \
git \
libcurl4-openssl-dev \
curl \
libgomp1
WORKDIR /app
COPY . .
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
&& cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib \
&& find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ${BASE_ROCM_DEV_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3-pip \
python3 \
python3-wheel\
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View file

@ -8,11 +8,11 @@ arg1="$1"
shift shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert_hf_to_gguf.py "$@" exec python3 ./convert_hf_to_gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./llama-quantize "$@" exec ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./llama-cli "$@" exec ./llama-cli "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..." echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do for i in `ls $1/$2/ggml-model-f16.bin*`; do
@ -20,11 +20,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}" echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
./llama-quantize "$i" "${i/f16/q4_0}" q4_0 exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
fi fi
done done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./llama-server "$@" exec ./llama-server "$@"
else else
echo "Unknown command: $arg1" echo "Unknown command: $arg1"
echo "Available commands: " echo "Available commands: "

88
.devops/vulkan.Dockerfile Normal file
View file

@ -0,0 +1,88 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View file

@ -1,50 +0,0 @@
name: Low Severity Bugs
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
title: "Bug: "
labels: ["bug-unconfirmed", "low severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./llama-cli --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View file

@ -0,0 +1,87 @@
name: Bug (compilation)
description: Something goes wrong when trying to compile llama.cpp.
title: "Compile bug: "
labels: ["bug-unconfirmed", "compilation"]
body:
- type: markdown
attributes:
value: >
Thanks for taking the time to fill out this bug report!
This issue template is intended for bug reports where the compilation of llama.cpp fails.
Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
by clearing `~/.cache/ccache` (on Linux).
- type: textarea
id: commit
attributes:
label: Git commit
description: Which commit are you trying to compile?
placeholder: |
$git rev-parse HEAD
84a07a17b1b08cf2b9747c633a2372782848a27f
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: Operating systems
description: Which operating systems do you know to be affected?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: true
- type: dropdown
id: backends
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
multiple: true
validations:
required: true
- type: textarea
id: info
attributes:
label: Problem description & steps to reproduce
description: >
Please give us a summary of the problem and tell us how to reproduce it.
If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
placeholder: >
I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
Here are the exact commands that I used: ...
validations:
required: true
- type: textarea
id: first_bad_commit
attributes:
label: First Bad Commit
description: >
If the bug was not present on an earlier version: when did it start appearing?
If possible, please do a git bisect and identify the exact commit that introduced the bug.
validations:
required: false
- type: textarea
id: command
attributes:
label: Compile command
description: >
Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
This will be automatically formatted into code, so no need for backticks.
render: shell
validations:
required: true
- type: textarea
id: logs
attributes:
label: Relevant log output
description: >
Please copy and paste any relevant log output, including any generated text.
This will be automatically formatted into code, so no need for backticks.
render: shell
validations:
required: true

View file

@ -0,0 +1,101 @@
name: Bug (model use)
description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
title: "Eval bug: "
labels: ["bug-unconfirmed", "model evaluation"]
body:
- type: markdown
attributes:
value: >
Thanks for taking the time to fill out this bug report!
This issue template is intended for bug reports where the model evaluation results
(i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
If you encountered the issue while using an external UI (e.g. ollama),
please reproduce your issue using one of the examples/binaries in this repository.
The `llama-cli` binary can be used for simple and reproducible model inference.
- type: textarea
id: version
attributes:
label: Name and Version
description: Which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./llama-cli --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: Operating systems
description: Which operating systems do you know to be affected?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: true
- type: dropdown
id: backends
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
multiple: true
validations:
required: true
- type: textarea
id: hardware
attributes:
label: Hardware
description: Which CPUs/GPUs are you using?
placeholder: >
e.g. Ryzen 5950X + 2x RTX 4090
validations:
required: true
- type: textarea
id: model
attributes:
label: Models
description: >
Which model(s) at which quantization were you using when encountering the bug?
If you downloaded a GGUF file off of Huggingface, please provide a link.
placeholder: >
e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
validations:
required: false
- type: textarea
id: info
attributes:
label: Problem description & steps to reproduce
description: >
Please give us a summary of the problem and tell us how to reproduce it.
If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
that information would be very much appreciated by us.
placeholder: >
e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
When I use -ngl 0 it works correctly.
Here are the exact commands that I used: ...
validations:
required: true
- type: textarea
id: first_bad_commit
attributes:
label: First Bad Commit
description: >
If the bug was not present on an earlier version: when did it start appearing?
If possible, please do a git bisect and identify the exact commit that introduced the bug.
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: >
Please copy and paste any relevant log output, including the command that you entered and any generated text.
This will be automatically formatted into code, so no need for backticks.
render: shell
validations:
required: true

91
.github/ISSUE_TEMPLATE/019-bug-misc.yml vendored Normal file
View file

@ -0,0 +1,91 @@
name: Bug (misc.)
description: Something is not working the way it should (and it's not covered by any of the above cases).
title: "Misc. bug: "
labels: ["bug-unconfirmed"]
body:
- type: markdown
attributes:
value: >
Thanks for taking the time to fill out this bug report!
This issue template is intended for miscellaneous bugs that don't fit into any other category.
If you encountered the issue while using an external UI (e.g. ollama),
please reproduce your issue using one of the examples/binaries in this repository.
- type: textarea
id: version
attributes:
label: Name and Version
description: Which version of our software is affected? (You can use `--version` to get a version string.)
placeholder: |
$./llama-cli --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: Operating systems
description: Which operating systems do you know to be affected?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: dropdown
id: module
attributes:
label: Which llama.cpp modules do you know to be affected?
multiple: true
options:
- Documentation/Github
- libllama (core library)
- llama-cli
- llama-server
- llama-bench
- llama-quantize
- Python/Bash scripts
- Test code
- Other (Please specify in the next section)
validations:
required: false
- type: textarea
id: command
attributes:
label: Command line
description: >
Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
This will be automatically formatted into code, so no need for backticks.
render: shell
validations:
required: false
- type: textarea
id: info
attributes:
label: Problem description & steps to reproduce
description: >
Please give us a summary of the problem and tell us how to reproduce it (if applicable).
validations:
required: true
- type: textarea
id: first_bad_commit
attributes:
label: First Bad Commit
description: >
If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
If possible, please do a git bisect and identify the exact commit that introduced the bug.
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: >
If applicable, please copy and paste any relevant log output, including any generated text.
This will be automatically formatted into code, so no need for backticks.
render: shell
validations:
required: false

View file

@ -1,50 +0,0 @@
name: Medium Severity Bug
description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
title: "Bug: "
labels: ["bug-unconfirmed", "medium severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./llama-cli --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View file

@ -1,5 +1,5 @@
name: Enhancement name: Enhancement
description: Used to request enhancements for llama.cpp description: Used to request enhancements for llama.cpp.
title: "Feature Request: " title: "Feature Request: "
labels: ["enhancement"] labels: ["enhancement"]
body: body:

View file

@ -1,50 +0,0 @@
name: High Severity Bug
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
title: "Bug: "
labels: ["bug-unconfirmed", "high severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./llama-cli --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View file

@ -1,5 +1,5 @@
name: Research name: Research
description: Track new technical research area description: Track new technical research area.
title: "Research: " title: "Research: "
labels: ["research 🔬"] labels: ["research 🔬"]
body: body:

View file

@ -1,50 +0,0 @@
name: Critical Severity Bug
description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
title: "Bug: "
labels: ["bug-unconfirmed", "critical severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./llama-cli --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View file

@ -1,5 +1,5 @@
name: Refactor (Maintainers) name: Refactor (Maintainers)
description: Used to track refactoring opportunities description: Used to track refactoring opportunities.
title: "Refactor: " title: "Refactor: "
labels: ["refactor"] labels: ["refactor"]
body: body:

15
.github/labeler.yml vendored
View file

@ -3,19 +3,18 @@ Kompute:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:
- ggml/include/ggml-kompute.h - ggml/include/ggml-kompute.h
- ggml/src/ggml-kompute.cpp - ggml/src/ggml-kompute/**
- README-kompute.md - README-kompute.md
Apple Metal: Apple Metal:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:
- ggml/include/ggml-metal.h - ggml/include/ggml-metal.h
- ggml/src/ggml-metal.cpp - ggml/src/ggml-metal/**
- README-metal.md - README-metal.md
SYCL: SYCL:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:
- ggml/include/ggml-sycl.h - ggml/include/ggml-sycl.h
- ggml/src/ggml-sycl.cpp
- ggml/src/ggml-sycl/** - ggml/src/ggml-sycl/**
- docs/backend/SYCL.md - docs/backend/SYCL.md
- examples/sycl/** - examples/sycl/**
@ -27,8 +26,8 @@ Nvidia GPU:
Vulkan: Vulkan:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:
- ggml/ggml_vk_generate_shaders.py - ggml/include/ggml-vulkan.h
- ggml/src/ggml-vulkan* - ggml/src/ggml-vulkan/**
documentation: documentation:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:
@ -75,11 +74,7 @@ server:
ggml: ggml:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:
- ggml/include/ggml*.h - ggml/**
- ggml/src/ggml*.c
- ggml/src/ggml*.cpp
- ggml/src/ggml*.h
- ggml-cuda/**
nix: nix:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:

View file

@ -1,7 +1 @@
*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
- Self-reported review complexity:
- [ ] Low
- [ ] Medium
- [ ] High

View file

@ -55,7 +55,12 @@ jobs:
sysctl -a sysctl -a
mkdir build mkdir build
cd build cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF .. cmake .. \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_RPC=ON
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test - name: Test
@ -113,7 +118,11 @@ jobs:
sysctl -a sysctl -a
# Metal is disabled due to intermittent failures with Github runners not having a GPU: # Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test - name: Test
@ -149,66 +158,6 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip name: llama-bin-macos-x64.zip
ubuntu-focal-make:
runs-on: ubuntu-20.04
env:
LLAMA_NODE_AVAILABLE: true
LLAMA_PYTHON_AVAILABLE: true
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8
- uses: actions/setup-node@v4
with:
node-version: "20"
- uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
run: |
CC=gcc-8 make -j $(nproc)
- name: Test
id: make_test
run: |
CC=gcc-8 make tests -j $(nproc)
make test -j $(nproc)
ubuntu-focal-make-curl:
runs-on: ubuntu-20.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev
- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
LLAMA_CURL: 1
run: |
CC=gcc-8 make -j $(nproc)
ubuntu-latest-cmake: ubuntu-latest-cmake:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -230,7 +179,7 @@ jobs:
run: | run: |
mkdir build mkdir build
cd build cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
cmake --build . --config Release -j $(nproc) cmake --build . --config Release -j $(nproc)
- name: Test - name: Test
@ -366,7 +315,7 @@ jobs:
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y sudo apt-get update -y
sudo apt-get install -y build-essential vulkan-sdk sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
- name: Build - name: Build
id: cmake_build id: cmake_build
@ -376,6 +325,12 @@ jobs:
cmake -DGGML_VULKAN=ON .. cmake -DGGML_VULKAN=ON ..
cmake --build . --config Release -j $(nproc) cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ubuntu-22-cmake-hip: ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:6.0.2 container: rocm/dev-ubuntu-22.04:6.0.2
@ -394,15 +349,36 @@ jobs:
- name: Build with native CMake HIP support - name: Build with native CMake HIP support
id: cmake_build id: cmake_build
run: | run: |
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
cmake --build build --config Release -j $(nproc) cmake --build build --config Release -j $(nproc)
- name: Build with legacy HIP support - name: Build with legacy HIP support
id: cmake_build_legacy_hip id: cmake_build_legacy_hip
run: | run: |
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
cmake --build build2 --config Release -j $(nproc) cmake --build build2 --config Release -j $(nproc)
ubuntu-22-cmake-musa:
runs-on: ubuntu-22.04
container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
run: |
apt-get update
apt-get install -y build-essential git cmake libcurl4-openssl-dev
- name: Build with native CMake MUSA support
id: cmake_build
run: |
cmake -B build -S . -DGGML_MUSA=ON
cmake --build build --config Release -j $(nproc)
ubuntu-22-cmake-sycl: ubuntu-22-cmake-sycl:
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04
@ -485,36 +461,6 @@ jobs:
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON .. cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
cmake --build . --config Release -j $(nproc) cmake --build . --config Release -j $(nproc)
# TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
macOS-latest-make:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
run: |
GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
- name: Test
id: make_test
run: |
GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it. # how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584 # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
@ -569,6 +515,7 @@ jobs:
mkdir build mkdir build
cd build cd build
cmake -G Xcode .. \ cmake -G Xcode .. \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_TESTS=OFF \
@ -599,6 +546,7 @@ jobs:
mkdir build mkdir build
cd build cd build
cmake -G Xcode .. \ cmake -G Xcode .. \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_TESTS=OFF \
@ -626,15 +574,26 @@ jobs:
run: | run: |
brew update brew update
- name: Build llama.cpp with CMake
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -G Xcode .. \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
sudo cmake --install . --config Release
- name: xcodebuild for swift package - name: xcodebuild for swift package
id: xcodebuild id: xcodebuild
run: | run: |
xcodebuild -scheme llama -destination "${{ matrix.destination }}" xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}"
- name: Build Swift Example
id: make_build_swift_example
run: |
make swift
windows-msys2: windows-msys2:
runs-on: windows-latest runs-on: windows-latest
@ -661,21 +620,6 @@ jobs:
mingw-w64-${{matrix.env}}-cmake mingw-w64-${{matrix.env}}-cmake
mingw-w64-${{matrix.env}}-openblas mingw-w64-${{matrix.env}}-openblas
- name: Build using make
shell: msys2 {0}
run: |
make -j $(nproc)
- name: Clean after building using make
shell: msys2 {0}
run: |
make clean
- name: Build using make w/ OpenBLAS
shell: msys2 {0}
run: |
make GGML_OPENBLAS=1 -j $(nproc)
- name: Build using CMake - name: Build using CMake
shell: msys2 {0} shell: msys2 {0}
run: | run: |
@ -694,7 +638,7 @@ jobs:
cmake --build build --config ${{ matrix.build }} -j $(nproc) cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake: windows-latest-cmake:
runs-on: windows-2019 runs-on: windows-latest
env: env:
OPENBLAS_VERSION: 0.3.23 OPENBLAS_VERSION: 0.3.23
@ -705,23 +649,25 @@ jobs:
matrix: matrix:
include: include:
- build: 'noavx-x64' - build: 'noavx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
- build: 'avx2-x64' - build: 'avx2-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
- build: 'avx-x64' - build: 'avx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
- build: 'avx512-x64' - build: 'avx512-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
- build: 'openblas-x64' - build: 'openblas-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64' - build: 'kompute-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
- build: 'vulkan-x64' - build: 'vulkan-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
- build: 'llvm-arm64' - build: 'llvm-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
- build: 'msvc-arm64' - build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
- build: 'llvm-arm64-opencl-adreno'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
steps: steps:
- name: Clone - name: Clone
@ -734,7 +680,7 @@ jobs:
id: clone_kompute id: clone_kompute
if: ${{ matrix.build == 'kompute-x64' }} if: ${{ matrix.build == 'kompute-x64' }}
run: | run: |
git submodule update --init ggml/src/kompute git submodule update --init ggml/src/ggml-kompute/kompute
- name: Download OpenBLAS - name: Download OpenBLAS
id: get_openblas id: get_openblas
@ -763,6 +709,28 @@ jobs:
run: | run: |
choco install ninja choco install ninja
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
run: |
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
mkdir build && cd build
cmake .. `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build . --target install
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build-arm64-release && cd build-arm64-release
cmake .. `
-A arm64 `
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build . --target install --config release
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
@ -792,7 +760,7 @@ jobs:
- name: Test - name: Test
id: cmake_test id: cmake_test
# not all machines have native AVX-512 # not all machines have native AVX-512
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: | run: |
cd build cd build
ctest -L main -C Release --verbose --timeout 900 ctest -L main -C Release --verbose --timeout 900
@ -837,12 +805,33 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
name: llama-bin-win-${{ matrix.build }}.zip name: llama-bin-win-${{ matrix.build }}.zip
windows-latest-cmake-cuda: ubuntu-latest-cmake-cuda:
runs-on: ubuntu-latest
container: nvidia/cuda:12.6.2-devel-ubuntu24.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Install dependencies
env:
DEBIAN_FRONTEND: noninteractive
run: |
apt update
apt install -y cmake build-essential ninja-build libgomp1 git
- name: Build with CMake
run: |
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
cmake --build build
windows-2019-cmake-cuda:
runs-on: windows-2019 runs-on: windows-2019
strategy: strategy:
matrix: matrix:
cuda: ['12.2.0', '11.7.1'] cuda: ['12.4', '11.7']
build: ['cuda'] build: ['cuda']
steps: steps:
@ -850,24 +839,83 @@ jobs:
id: checkout id: checkout
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
fetch-depth: 0 fetch-depth: 0
- name: Install CUDA toolkit - name: Install Cuda Toolkit 11.7
id: cuda-toolkit if: ${{ matrix.cuda == '11.7' }}
uses: Jimver/cuda-toolkit@v0.2.15 run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
- name: Install Cuda Toolkit 12.4
if: ${{ matrix.cuda == '12.4' }}
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2
with: with:
cuda: ${{ matrix.cuda }} key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
method: 'network'
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' - name: Install Ninja
id: install_ninja
run: |
choco install ninja
- name: Build - name: Build
id: cmake_build id: cmake_build
shell: cmd
run: | run: |
mkdir build call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cd build cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} cmake --build build --config Release
- name: Determine tag name - name: Determine tag name
id: tag id: tag
@ -896,10 +944,12 @@ jobs:
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime - name: Copy and pack Cuda runtime
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
run: | run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}" echo "Cuda install location: ${{ env.CUDA_PATH }}"
$dst='.\build\bin\cudart\' $dst='.\build\bin\cudart\'
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
- name: Upload Cuda runtime - name: Upload Cuda runtime
@ -917,8 +967,8 @@ jobs:
shell: bash shell: bash
env: env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps: steps:
- name: Clone - name: Clone
@ -928,7 +978,8 @@ jobs:
fetch-depth: 0 fetch-depth: 0
- name: Install - name: Install
run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: Build - name: Build
id: cmake_build id: cmake_build
@ -947,25 +998,33 @@ jobs:
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi fi
- name: Pack artifacts - name: Build the release package
id: pack_artifacts id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: | run: |
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done" echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload artifacts - name: Upload the release package
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
@ -996,12 +1055,17 @@ jobs:
run: | run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ github.job }}
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS} cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
windows-latest-cmake-hip-release: windows-latest-cmake-hip-release:
@ -1016,6 +1080,8 @@ jobs:
- name: Clone - name: Clone
id: checkout id: checkout
uses: actions/checkout@v4 uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install - name: Install
id: depends id: depends
@ -1037,7 +1103,7 @@ jobs:
run: | run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS} cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\" md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
@ -1075,6 +1141,29 @@ jobs:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -G Xcode .. \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
sudo cmake --install . --config Release
- name: xcodebuild for swift package
id: xcodebuild
run: |
xcodebuild -scheme llama-Package -destination 'generic/platform=iOS'
- name: Build Xcode project - name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
@ -1102,35 +1191,16 @@ jobs:
./gradlew build --no-daemon ./gradlew build --no-daemon
# freeBSD-latest:
# runs-on: macos-12
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Build
# uses: cross-platform-actions/action@v0.19.0
# with:
# operating_system: freebsd
# version: '13.2'
# hypervisor: 'qemu'
# run: |
# sudo pkg update
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
release: release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: needs:
- ubuntu-focal-make
- ubuntu-latest-cmake - ubuntu-latest-cmake
- macOS-latest-make
- macOS-latest-cmake - macOS-latest-cmake
- windows-latest-cmake - windows-latest-cmake
- windows-latest-cmake-cuda - windows-2019-cmake-cuda
- windows-latest-cmake-hip-release - windows-latest-cmake-hip-release
- macOS-latest-cmake-arm64 - macOS-latest-cmake-arm64
- macOS-latest-cmake-x64 - macOS-latest-cmake-x64
@ -1167,7 +1237,7 @@ jobs:
- name: Create release - name: Create release
id: create_release id: create_release
uses: anzz1/action-create-release@v1 uses: ggml-org/action-create-release@v1
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with: with:

View file

@ -10,12 +10,10 @@
name: Publish Docker image name: Publish Docker image
on: on:
#pull_request: workflow_dispatch: # allows manual triggering
push: schedule:
branches: # Rebuild daily rather than on every push because it is expensive
- master - cron: '12 4 * * *'
paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
workflow_dispatch: # allows manual triggering, useful for debugging
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -29,7 +27,6 @@ permissions:
jobs: jobs:
push_to_registry: push_to_registry:
name: Push Docker image to Docker Hub name: Push Docker image to Docker Hub
#if: github.event.pull_request.draft == false
runs-on: ubuntu-latest runs-on: ubuntu-latest
env: env:
@ -37,21 +34,14 @@ jobs:
strategy: strategy:
matrix: matrix:
config: config:
- { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" } # Multi-stage build
- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
#- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
steps: steps:
- name: Check out the repo - name: Check out the repo
uses: actions/checkout@v4 uses: actions/checkout@v4
@ -59,10 +49,10 @@ jobs:
fetch-depth: 0 # preserve git history, so we can determine the build number fetch-depth: 0 # preserve git history, so we can determine the build number
- name: Set up QEMU - name: Set up QEMU
uses: docker/setup-qemu-action@v2 uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx - name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2 uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub - name: Log in to Docker Hub
uses: docker/login-action@v2 uses: docker/login-action@v2
@ -82,26 +72,34 @@ jobs:
# determine tag name postfix (build number, commit hash) # determine tag name postfix (build number, commit hash)
if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
TAG_POSTFIX="b${BUILD_NUMBER}" TAG_POSTFIX="-b${BUILD_NUMBER}"
else else
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-') SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}" TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
fi fi
# list all tags possible # list all tags possible
TAGS="" if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}," TYPE=""
TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}" else
TYPE="-${{ matrix.config.tag }}"
echo "output_tags=$TAGS" >> $GITHUB_OUTPUT fi
echo "output_tags=$TAGS" # print out for debugging PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
echo "full_output_tags=$FULLTAGS" # print out for debugging
echo "light_output_tags=$LIGHTTAGS" # print out for debugging
echo "server_output_tags=$SERVERTAGS" # print out for debugging
env: env:
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
# https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
- name: Free Disk Space (Ubuntu) - name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main if: ${{ matrix.config.free_disk_space == true }}
uses: ggml-org/free-disk-space@v1.3.1
with: with:
# this might remove tools that are actually needed, # this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB # if set to "true" but frees about 6 GB
@ -116,13 +114,59 @@ jobs:
docker-images: true docker-images: true
swap-storage: true swap-storage: true
- name: Build and push Docker image (tagged + versioned) - name: Build and push Full Docker image (tagged + versioned)
if: github.event_name == 'push' if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
uses: docker/build-push-action@v6 uses: docker/build-push-action@v6
with: with:
context: . context: .
push: true push: true
platforms: ${{ matrix.config.platforms }} platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above # tag list is generated from step above
tags: ${{ steps.tag.outputs.output_tags }} tags: ${{ steps.tag.outputs.full_output_tags }}
file: ${{ matrix.config.dockerfile }} file: ${{ matrix.config.dockerfile }}
target: full
provenance: false
# using github experimental cache
cache-from: type=gha
cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
- name: Build and push Light Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
uses: docker/build-push-action@v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above
tags: ${{ steps.tag.outputs.light_output_tags }}
file: ${{ matrix.config.dockerfile }}
target: light
provenance: false
# using github experimental cache
cache-from: type=gha
cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
- name: Build and push Server Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
uses: docker/build-push-action@v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above
tags: ${{ steps.tag.outputs.server_output_tags }}
file: ${{ matrix.config.dockerfile }}
target: server
provenance: false
# using github experimental cache
cache-from: type=gha
cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache

View file

@ -23,5 +23,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- uses: editorconfig-checker/action-editorconfig-checker@main - uses: editorconfig-checker/action-editorconfig-checker@v2
with:
version: v3.0.3
- run: editorconfig-checker - run: editorconfig-checker

View file

@ -1,72 +0,0 @@
name: Nix aarch64 builds
on:
workflow_dispatch: # allows manual triggering
schedule:
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
# 1.5h instead of minutes with the cold cache).
#
# randint(0, 59), randint(0, 23)
- cron: '26 12 * * *'
# But also rebuild if we touched any of the Nix expressions:
push:
branches:
- master
paths: ['**/*.nix', 'flake.lock']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/*.nix', 'flake.lock']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
# https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
id-token: write
contents: read
jobs:
nix-build-aarch64:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install QEMU
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
run: |
sudo apt-get update
sudo apt-get install -y qemu-user-static qemu-system-aarch64
sudo usermod -a -G kvm $USER
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-platforms = aarch64-linux
extra-system-features = nixos-test kvm
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: llama-cpp
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.aarch64-linux"
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--systems aarch64-linux
--flake
".#checks.aarch64-linux"

View file

@ -1,79 +0,0 @@
name: Nix CI
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
pull_request:
types: [opened, synchronize, reopened]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
# https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
id-token: write
contents: read
jobs:
nix-eval:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: List all flake outputs
run: nix flake show --all-systems
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
nix-build:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: llama-cpp
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--flake
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"

View file

@ -1,22 +0,0 @@
name: update-flake-lock
on:
workflow_dispatch:
schedule:
- cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
jobs:
lockfile:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@main
- name: Update flake.lock
uses: DeterminateSystems/update-flake-lock@main
with:
pr-title: "nix: update flake.lock"
pr-labels: |
nix
pr-reviewers: philiptaron,SomeoneSerge
token: ${{ secrets.FLAKE_TOKEN }}

View file

@ -1,36 +0,0 @@
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
push:
tags:
- "*"
workflow_dispatch:
inputs:
tag:
description: "The existing tag to publish"
type: "string"
required: true
jobs:
flakestry-publish:
runs-on: ubuntu-latest
permissions:
id-token: "write"
contents: "read"
steps:
- uses: flakestry/flakestry-publish@main
with:
version: "${{ inputs.tag || github.ref_name }}"
flakehub-publish:
runs-on: "ubuntu-latest"
permissions:
id-token: "write"
contents: "read"
steps:
- uses: "actions/checkout@v4"
with:
ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
- uses: "DeterminateSystems/nix-installer-action@main"
- uses: "DeterminateSystems/flakehub-push@main"
with:
visibility: "public"
tag: "${{ inputs.tag }}"

View file

@ -1,6 +1,13 @@
name: flake8 Lint name: flake8 Lint
on: [push, pull_request] on:
push:
branches:
- master
paths: ['.github/workflows/python-lint.yml', '**/*.py']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/python-lint.yml', '**/*.py']
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

View file

@ -76,20 +76,26 @@ jobs:
run: | run: |
pip install -r examples/server/tests/requirements.txt pip install -r examples/server/tests/requirements.txt
- name: Verify server deps # Setup nodejs (to be used for verifying bundled index.html)
id: verify_server_deps - uses: actions/setup-node@v4
with:
node-version: '22.11.0'
- name: Verify bundled index.html
id: verify_server_index_html
run: | run: |
git config --global --add safe.directory $(realpath .) git config --global --add safe.directory $(realpath .)
cd examples/server cd examples/server/webui
git ls-files --others --modified
git status git status
./deps.sh npm ci
npm run build
git status git status
not_ignored_files="$(git ls-files --others --modified)" modified_files="$(git status -s)"
echo "Modified files: ${not_ignored_files}" echo "Modified files: ${modified_files}"
if [ -n "${not_ignored_files}" ]; then if [ -n "${modified_files}" ]; then
echo "Repository is dirty or server deps are not built as expected" echo "Repository is dirty or server/webui is not built as expected"
echo "${not_ignored_files}" echo "Hint: You may need to follow Web UI build guide in server/README.md"
echo "${modified_files}"
exit 1 exit 1
fi fi
@ -122,14 +128,14 @@ jobs:
id: server_integration_tests id: server_integration_tests
run: | run: |
cd examples/server/tests cd examples/server/tests
PORT=8888 ./tests.sh ./tests.sh
- name: Slow tests - name: Slow tests
id: server_integration_tests_slow id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: | run: |
cd examples/server/tests cd examples/server/tests
PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow SLOW_TESTS=1 ./tests.sh
server-windows: server-windows:
@ -180,11 +186,12 @@ jobs:
run: | run: |
cd examples/server/tests cd examples/server/tests
$env:PYTHONIOENCODING = ":replace" $env:PYTHONIOENCODING = ":replace"
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp pytest -v -x
- name: Slow tests - name: Slow tests
id: server_integration_tests_slow id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: | run: |
cd examples/server/tests cd examples/server/tests
behave.exe --stop --no-skipped --no-capture --tags slow $env:SLOW_TESTS = "1"
pytest -v -x

10
.gitignore vendored
View file

@ -3,6 +3,7 @@
*.a *.a
*.bat *.bat
*.bin *.bin
*.d
*.dll *.dll
*.dot *.dot
*.etag *.etag
@ -17,6 +18,7 @@
*.metallib *.metallib
*.o *.o
*.so *.so
*.swp
*.tmp *.tmp
# IDE / OS # IDE / OS
@ -103,6 +105,10 @@ examples/server/*.mjs.hpp
!examples/sycl/*.bat !examples/sycl/*.bat
!examples/sycl/*.sh !examples/sycl/*.sh
# Server Web UI temporary files
node_modules
examples/server/webui/dist
# Python # Python
/.venv /.venv
@ -133,3 +139,7 @@ poetry.toml
# Test models for lora adapters # Test models for lora adapters
/lora-tests /lora-tests
# Local scripts
/run-vim.sh
/run-chat.sh

2
.gitmodules vendored
View file

@ -1,3 +1,3 @@
[submodule "kompute"] [submodule "kompute"]
path = ggml/src/kompute path = ggml/src/ggml-kompute/kompute
url = https://github.com/nomic-ai/kompute.git url = https://github.com/nomic-ai/kompute.git

186
AUTHORS
View file

@ -1,4 +1,4 @@
# date: Wed Jun 26 19:36:34 EEST 2024 # date: Thu Nov 28 20:46:15 EET 2024
# this file is auto-generated by scripts/gen-authors.sh # this file is auto-generated by scripts/gen-authors.sh
0cc4m <picard12@live.de> 0cc4m <picard12@live.de>
@ -7,6 +7,7 @@
2f38b454 <dxf@protonmail.com> 2f38b454 <dxf@protonmail.com>
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com> 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
44670 <44670@users.noreply.github.com> 44670 <44670@users.noreply.github.com>
65a <10104049+65a@users.noreply.github.com>
AN Long <aisk@users.noreply.github.com> AN Long <aisk@users.noreply.github.com>
AT <manyoso@users.noreply.github.com> AT <manyoso@users.noreply.github.com>
Aarni Koskela <akx@iki.fi> Aarni Koskela <akx@iki.fi>
@ -19,20 +20,28 @@ Adithya Balaji <adithya.b94@gmail.com>
AdithyanI <adithyan.i4internet@gmail.com> AdithyanI <adithyan.i4internet@gmail.com>
Adrian <smith.adriane@gmail.com> Adrian <smith.adriane@gmail.com>
Adrian Hesketh <a-h@users.noreply.github.com> Adrian Hesketh <a-h@users.noreply.github.com>
Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr> Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
AidanBeltonS <aidan.belton@codeplay.com>
Aisuko <urakiny@gmail.com> Aisuko <urakiny@gmail.com>
Akarshan Biswas <akarshan.biswas@gmail.com>
Akarshan Biswas <akarshanbiswas@fedoraproject.org> Akarshan Biswas <akarshanbiswas@fedoraproject.org>
Al Mochkin <14274697+amochkin@users.noreply.github.com>
Albert Jin <albert.jin@gmail.com> Albert Jin <albert.jin@gmail.com>
Alberto <57916483+albbus-stack@users.noreply.github.com> Alberto <57916483+albbus-stack@users.noreply.github.com>
Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
Alberto Cabrera Pérez <alberto.cabrera@intel.com>
Alex <awhill19@icloud.com> Alex <awhill19@icloud.com>
Alex Azarov <alex@azarov.by> Alex Azarov <alex@azarov.by>
Alex Azarov <alexander.azarov@mapbox.com> Alex Azarov <alexander.azarov@mapbox.com>
Alex Klinkhamer <from.github.com.917@grencez.dev> Alex Klinkhamer <from.github.com.917@grencez.dev>
Alex Klinkhamer <git@grencez.dev> Alex Klinkhamer <git@grencez.dev>
Alex Nguyen <tiendung@users.noreply.github.com> Alex Nguyen <tiendung@users.noreply.github.com>
Alex O'Connell <35843486+acon96@users.noreply.github.com>
Alex Petenchea <alex.petenchea@gmail.com> Alex Petenchea <alex.petenchea@gmail.com>
Alex Renda <alexrenda@users.noreply.github.com> Alex Renda <alexrenda@users.noreply.github.com>
Alex Tuddenham <61622354+AlexsCode@users.noreply.github.com>
Alex von Gluck IV <kallisti5@unixzen.com> Alex von Gluck IV <kallisti5@unixzen.com>
Alexey Parfenov <zxed@alkatrazstudio.net> Alexey Parfenov <zxed@alkatrazstudio.net>
Ali Chraghi <63465728+alichraghi@users.noreply.github.com> Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
@ -45,18 +54,25 @@ AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
Ananta Bastola <anantarajbastola@gmail.com> Ananta Bastola <anantarajbastola@gmail.com>
Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
András Salamon <ott2@users.noreply.github.com> András Salamon <ott2@users.noreply.github.com>
Andreas (Andi) Kunar <andreask@msn.com>
Andrei <abetlen@gmail.com> Andrei <abetlen@gmail.com>
Andrew Canis <andrew.canis@gmail.com> Andrew Canis <andrew.canis@gmail.com>
Andrew Downing <andrew2085@gmail.com> Andrew Downing <andrew2085@gmail.com>
Andrew Duffy <a10y@users.noreply.github.com> Andrew Duffy <a10y@users.noreply.github.com>
Andrew Godfrey <AndrewGodfrey@users.noreply.github.com> Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
Andy Salerno <andysalerno@gmail.com>
Andy Tai <andy-tai@users.noreply.github.com> Andy Tai <andy-tai@users.noreply.github.com>
Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
Antonis Makropoulos <benuix@gmail.com>
Arik Poznanski <arikpoz@users.noreply.github.com> Arik Poznanski <arikpoz@users.noreply.github.com>
Armen Kaleshian <kriation@users.noreply.github.com>
Artem <guinmoon@gmail.com> Artem <guinmoon@gmail.com>
Artem Zinnatullin <ceo@abstractny.gay> Artem Zinnatullin <ceo@abstractny.gay>
Artyom Lebedev <vagran.ast@gmail.com> Artyom Lebedev <vagran.ast@gmail.com>
Asbjørn Olling <asbjornolling@gmail.com> Asbjørn Olling <asbjornolling@gmail.com>
Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org> Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
Asghar Ghorbani <a-ghorbani@users.noreply.github.com>
Ashish <1856117+ashishdatta@users.noreply.github.com> Ashish <1856117+ashishdatta@users.noreply.github.com>
Ashok Gelal <401055+ashokgelal@users.noreply.github.com> Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
Ashraful Islam <ashraful.meche@gmail.com> Ashraful Islam <ashraful.meche@gmail.com>
@ -76,12 +92,16 @@ Ben Williams <ben@719ben.com>
Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com> Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com> Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
Bernat Vadell <hounter.caza@gmail.com> Bernat Vadell <hounter.caza@gmail.com>
Bert Wagner <github@bertwagner.com>
Bingan <70050083+binganao@users.noreply.github.com> Bingan <70050083+binganao@users.noreply.github.com>
Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
Bodo Graumann <mail@bodograumann.de> Bodo Graumann <mail@bodograumann.de>
Bono Lv <lvscar@users.noreply.github.com> Bono Lv <lvscar@users.noreply.github.com>
Borislav Stanimirov <b.stanimirov@abv.bg> Borislav Stanimirov <b.stanimirov@abv.bg>
Branden Butler <bwtbutler@hotmail.com> Branden Butler <bwtbutler@hotmail.com>
Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
Brian <mofosyne@gmail.com> Brian <mofosyne@gmail.com>
Brian Cunnie <brian.cunnie@gmail.com>
Bruce MacDonald <brucewmacdonald@gmail.com> Bruce MacDonald <brucewmacdonald@gmail.com>
Bryan Honof <bryanhonof@gmail.com> Bryan Honof <bryanhonof@gmail.com>
CJ Pais <cj@cjpais.com> CJ Pais <cj@cjpais.com>
@ -90,32 +110,47 @@ Calvin Laurenson <calvin@laurenson.dev>
Cameron <csteele@steelecameron.com> Cameron <csteele@steelecameron.com>
Cameron Kaiser <classilla@users.noreply.github.com> Cameron Kaiser <classilla@users.noreply.github.com>
Carolinabanana <140120812+Carolinabanana@users.noreply.github.com> Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
CarryFun <76023481+CarryFun@users.noreply.github.com>
Carsten Kragelund Jørgensen <carsten@kragelund.me>
CarterLi999 <664681047@qq.com>
Casey Primozic <casey@cprimozic.net> Casey Primozic <casey@cprimozic.net>
Casey Primozic <me@ameo.link> Casey Primozic <me@ameo.link>
CausalLM <148736309+CausalLM@users.noreply.github.com> CausalLM <148736309+CausalLM@users.noreply.github.com>
Cebtenzzre <cebtenzzre@gmail.com> Cebtenzzre <cebtenzzre@gmail.com>
Chad Brewbaker <crb002@gmail.com> Chad Brewbaker <crb002@gmail.com>
Changyeon Kim <cyzero.kim@samsung.com>
Chao Jiang <jc19chaoj@zoho.com> Chao Jiang <jc19chaoj@zoho.com>
Charles Xu <63788048+chaxu01@users.noreply.github.com>
Charles Xu <charles.xu@arm.com>
Chen Xi <xi2.chen@intel.com>
Chen Xi <xixichen08@foxmail.com>
Cheng Shao <terrorjack@type.dance> Cheng Shao <terrorjack@type.dance>
Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
Chris Elrod <elrodc@gmail.com> Chris Elrod <elrodc@gmail.com>
Chris Kuehl <ckuehl@ckuehl.me> Chris Kuehl <ckuehl@ckuehl.me>
Christian Demsar <christian@github.email.demsar.us> Christian Demsar <christian@github.email.demsar.us>
Christian Demsar <crasm@git.vczf.us> Christian Demsar <crasm@git.vczf.us>
Christian Falch <875252+chrfalch@users.noreply.github.com> Christian Falch <875252+chrfalch@users.noreply.github.com>
Christian Kögler <ck3d@gmx.de> Christian Kögler <ck3d@gmx.de>
Christian Köhnenkamp <cvk5@me.com>
Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
Clark Saben <76020733+csaben@users.noreply.github.com> Clark Saben <76020733+csaben@users.noreply.github.com>
Clint Herron <hanclinto@gmail.com> Clint Herron <hanclinto@gmail.com>
Conrad Kramer <conrad@conradkramer.com>
CrispStrobe <154636388+CrispStrobe@users.noreply.github.com> CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
Csaba Kecskemeti <csaba.kecskemeti@gmail.com>
Cuong Trinh Manh <nguoithichkhampha@gmail.com> Cuong Trinh Manh <nguoithichkhampha@gmail.com>
DAN™ <dranger003@gmail.com> DAN™ <dranger003@gmail.com>
Damian Stewart <d@damianstewart.com> Damian Stewart <d@damianstewart.com>
Dan Johansson <164997844+eddnjjn@users.noreply.github.com>
Dan Johansson <dan.johansson@arm.com>
Dane Madsen <dane_madsen@hotmail.com> Dane Madsen <dane_madsen@hotmail.com>
DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com> DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
Daniel Bevenius <daniel.bevenius@gmail.com> Daniel Bevenius <daniel.bevenius@gmail.com>
Daniel Drake <drake@endlessos.org> Daniel Drake <drake@endlessos.org>
Daniel Hiltgen <dhiltgen@users.noreply.github.com> Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Daniel Illescas Romero <illescas.daniel@protonmail.com> Daniel Illescas Romero <illescas.daniel@protonmail.com>
Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
Daniele <57776841+daniandtheweb@users.noreply.github.com> Daniele <57776841+daniandtheweb@users.noreply.github.com>
DannyDaemonic <DannyDaemonic@gmail.com> DannyDaemonic <DannyDaemonic@gmail.com>
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com> Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
@ -129,19 +164,28 @@ David Pflug <david@pflug.email>
David Renshaw <dwrenshaw@gmail.com> David Renshaw <dwrenshaw@gmail.com>
David Sommers <12738+databyte@users.noreply.github.com> David Sommers <12738+databyte@users.noreply.github.com>
David Yang <davidyang6us@gmail.com> David Yang <davidyang6us@gmail.com>
DavidKorczynski <david@adalogics.com>
Dawid Potocki <github@dawidpotocki.com> Dawid Potocki <github@dawidpotocki.com>
Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com> Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
Dean <Dean.Sinaean@gmail.com> Dean <Dean.Sinaean@gmail.com>
Deins <deinsegle@gmail.com> Deins <deinsegle@gmail.com>
Denis Spasyuk <34203011+dspasyuk@users.noreply.github.com>
Derrick T. Woolworth <dwoolworth@gmail.com>
Deven Mistry <31466137+deven367@users.noreply.github.com> Deven Mistry <31466137+deven367@users.noreply.github.com>
Dibakar Gope <dibakar.gope@arm.com>
Didzis Gosko <didzis@users.noreply.github.com> Didzis Gosko <didzis@users.noreply.github.com>
Diego Devesa <slarengh@gmail.com>
Diogo Teles Sant'Anna <diogoteles@google.com>
Djip007 <djip.perois@free.fr> Djip007 <djip.perois@free.fr>
Don Mahurin <dmahurin@users.noreply.github.com> Don Mahurin <dmahurin@users.noreply.github.com>
DooWoong Lee (David) <manics99@naver.com> DooWoong Lee (David) <manics99@naver.com>
Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com> Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
Dou Xinpeng <15529241576@163.com>
Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com>
Douglas Hanley <thesecretaryofwar@gmail.com> Douglas Hanley <thesecretaryofwar@gmail.com>
Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com> Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
Ebey Abraham <ebey97@gmail.com> Ebey Abraham <ebey97@gmail.com>
Echo Nolan <echo@echonolan.net>
Ed Lee <edilee@mozilla.com> Ed Lee <edilee@mozilla.com>
Ed Lepedus <ed.lepedus@googlemail.com> Ed Lepedus <ed.lepedus@googlemail.com>
Eddie-Wang <wangjinheng1120@163.com> Eddie-Wang <wangjinheng1120@163.com>
@ -151,10 +195,13 @@ Elbios <141279586+Elbios@users.noreply.github.com>
Elton Kola <eltonkola@gmail.com> Elton Kola <eltonkola@gmail.com>
Engininja2 <139037756+Engininja2@users.noreply.github.com> Engininja2 <139037756+Engininja2@users.noreply.github.com>
Equim <sayaka@ekyu.moe> Equim <sayaka@ekyu.moe>
Eric Curtin <ecurtin@redhat.com>
Eric Curtin <ericcurtin17@gmail.com>
Eric Sommerlade <es0m@users.noreply.github.com> Eric Sommerlade <es0m@users.noreply.github.com>
Eric Zhang <34133756+EZForever@users.noreply.github.com> Eric Zhang <34133756+EZForever@users.noreply.github.com>
Erik Garrison <erik.garrison@gmail.com> Erik Garrison <erik.garrison@gmail.com>
Erik Scholz <Green-Sky@users.noreply.github.com> Erik Scholz <Green-Sky@users.noreply.github.com>
Esko Toivonen <eskot98@gmail.com>
Ettore Di Giacinto <mudler@users.noreply.github.com> Ettore Di Giacinto <mudler@users.noreply.github.com>
Evan Jones <evan.q.jones@gmail.com> Evan Jones <evan.q.jones@gmail.com>
Evan Miller <emmiller@gmail.com> Evan Miller <emmiller@gmail.com>
@ -166,19 +213,26 @@ FK <sozforex@gmail.com>
Fabian <cmdrf@users.noreply.github.com> Fabian <cmdrf@users.noreply.github.com>
Fabio R. Sluzala <Fabio3rs@users.noreply.github.com> Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
Faez Shakil <faez.shakil@gmail.com> Faez Shakil <faez.shakil@gmail.com>
Faisal Zaghloul <faisal.zaghloul@gmail.com>
Faisal Zaghloul <quic_fzaghlou@quicinc.com>
Fan Shupei <dymarkfan@outlook.com>
FantasyGmm <16450052+FantasyGmm@users.noreply.github.com> FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
Farbod Bijary <110523279+farbodbj@users.noreply.github.com>
Fattire <528174+fat-tire@users.noreply.github.com> Fattire <528174+fat-tire@users.noreply.github.com>
Felix <stenbackfelix@gmail.com> Felix <stenbackfelix@gmail.com>
Finn Voorhees <finnvoorhees@gmail.com> Finn Voorhees <finnvoorhees@gmail.com>
Firat <firatkiral@gmail.com> Firat <firatkiral@gmail.com>
FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com> Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com> Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
Francisco Melo <43780565+francis2tm@users.noreply.github.com> Francisco Melo <43780565+francis2tm@users.noreply.github.com>
Frank Mai <thxcode0824@gmail.com> Frank Mai <thxcode0824@gmail.com>
FrankHB <frankhb1989@gmail.com> FrankHB <frankhb1989@gmail.com>
Frankie Robertson <frankier@users.noreply.github.com>
Fred Douglas <43351173+fredlas@users.noreply.github.com> Fred Douglas <43351173+fredlas@users.noreply.github.com>
Frederik Vogel <Schaltfehler@users.noreply.github.com> Frederik Vogel <Schaltfehler@users.noreply.github.com>
Gabe Goodhart <gabe.l.hart@gmail.com> Gabe Goodhart <gabe.l.hart@gmail.com>
Gabe Goodhart <ghart@us.ibm.com>
GainLee <perfecter.gen@gmail.com> GainLee <perfecter.gen@gmail.com>
Galunid <karolek1231456@gmail.com> Galunid <karolek1231456@gmail.com>
Gary Linscott <glinscott@gmail.com> Gary Linscott <glinscott@gmail.com>
@ -187,11 +241,13 @@ Gavin Zhao <gavinzhaojw@protonmail.com>
Genkagaku.GPT <hlhr202@163.com> Genkagaku.GPT <hlhr202@163.com>
Georgi Gerganov <ggerganov@gmail.com> Georgi Gerganov <ggerganov@gmail.com>
Gilad S <giladgd@users.noreply.github.com> Gilad S <giladgd@users.noreply.github.com>
Gilad S. <7817232+giladgd@users.noreply.github.com>
Giuseppe Scrivano <giuseppe@scrivano.org> Giuseppe Scrivano <giuseppe@scrivano.org>
GiviMAD <GiviMAD@users.noreply.github.com> GiviMAD <GiviMAD@users.noreply.github.com>
Govlzkoy <gotope@users.noreply.github.com> Govlzkoy <gotope@users.noreply.github.com>
Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com> Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
Guillaume Wenzek <gwenzek@users.noreply.github.com> Guillaume Wenzek <gwenzek@users.noreply.github.com>
Guoliang Hua <32868157+nbcsm@users.noreply.github.com>
Guoteng <32697156+SolenoidWGT@users.noreply.github.com> Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com> Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
Haggai Nuchi <h.nuchi@gmail.com> Haggai Nuchi <h.nuchi@gmail.com>
@ -213,11 +269,14 @@ Hong Bo PENG <penghb@cn.ibm.com>
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com> Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
Howard Su <howard0su@gmail.com> Howard Su <howard0su@gmail.com>
Hua Jiang <allenhjiang@outlook.com> Hua Jiang <allenhjiang@outlook.com>
Huang Qi <huangqi3@xiaomi.com>
Huawei Lin <huaweilin.cs@gmail.com> Huawei Lin <huaweilin.cs@gmail.com>
Hugo Roussel <hugo.rous@gmail.com> Hugo Roussel <hugo.rous@gmail.com>
Huifeng Ou <79071290+ho2103@users.noreply.github.com>
Ian Bull <irbull@eclipsesource.com> Ian Bull <irbull@eclipsesource.com>
Ian Bull <irbull@gmail.com> Ian Bull <irbull@gmail.com>
Ian Scrivener <github@zilogy.asia> Ian Scrivener <github@zilogy.asia>
Icecream95 <the.real.icecream95@gmail.com>
Ido S <ido.pluto@gmail.com> Ido S <ido.pluto@gmail.com>
IgnacioFDM <ignaciofdm@gmail.com> IgnacioFDM <ignaciofdm@gmail.com>
Igor Okulist <okigan@gmail.com> Igor Okulist <okigan@gmail.com>
@ -226,11 +285,15 @@ Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Ionoclast Laboratories <brigham@ionoclast.com> Ionoclast Laboratories <brigham@ionoclast.com>
Isaac McFadyen <isaac@imcf.me> Isaac McFadyen <isaac@imcf.me>
IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com> IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
Ivan <nekotekina@gmail.com>
Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
Ivan Komarov <Ivan.Komarov@dfyz.info> Ivan Komarov <Ivan.Komarov@dfyz.info>
Ivan Stepanov <ivanstepanovftw@gmail.com> Ivan Stepanov <ivanstepanovftw@gmail.com>
JH23X <165871467+JH23X@users.noreply.github.com> JH23X <165871467+JH23X@users.noreply.github.com>
Jack Mousseau <jack@software.inc>
Jack Mousseau <jmousseau@users.noreply.github.com> Jack Mousseau <jmousseau@users.noreply.github.com>
JackJollimore <130917767+JackJollimore@users.noreply.github.com> JackJollimore <130917767+JackJollimore@users.noreply.github.com>
Jaeden Amero <jaeden@patater.com>
Jaemin Son <woalsdnd@gmail.com> Jaemin Son <woalsdnd@gmail.com>
Jag Chadha <jagtesh@gmail.com> Jag Chadha <jagtesh@gmail.com>
Jakub N <jakubniemczyk97@gmail.com> Jakub N <jakubniemczyk97@gmail.com>
@ -243,10 +306,14 @@ Jannis Schönleber <joennlae@gmail.com>
Jared Van Bortel <cebtenzzre@gmail.com> Jared Van Bortel <cebtenzzre@gmail.com>
Jared Van Bortel <jared@nomic.ai> Jared Van Bortel <jared@nomic.ai>
Jason McCartney <jmac@theroot.org> Jason McCartney <jmac@theroot.org>
Jason Stillerman <jason.t.stillerman@gmail.com>
Jean-Christophe Hoelt <hoelt@fovea.cc> Jean-Christophe Hoelt <hoelt@fovea.cc>
Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com> Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
Jed Fox <git@jedfox.com> Jed Fox <git@jedfox.com>
Jeff Bolz <jbolz@nvidia.com>
Jeffrey Morgan <jmorganca@gmail.com>
Jeffrey Quesnelle <emozilla@nousresearch.com> Jeffrey Quesnelle <emozilla@nousresearch.com>
Jeroen Mostert <jeroen.mostert@cm.com>
Jesse Jojo Johnson <williamsaintgeorge@gmail.com> Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
Jeximo <jeximo@gmail.com> Jeximo <jeximo@gmail.com>
Jhen-Jie Hong <iainst0409@gmail.com> Jhen-Jie Hong <iainst0409@gmail.com>
@ -258,6 +325,9 @@ Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
Jiří Sejkora <Sejseloid@gmail.com> Jiří Sejkora <Sejseloid@gmail.com>
Joan Fontanals <jfontanalsmartinez@gmail.com> Joan Fontanals <jfontanalsmartinez@gmail.com>
Joan Fontanals <joan.fontanals.martinez@jina.ai> Joan Fontanals <joan.fontanals.martinez@jina.ai>
João Dinis Ferreira <hello@joaof.eu>
Joe Eli McIlvain <joe.eli.mac@gmail.com>
Joe Todd <joe.todd@codeplay.com>
Johan <JohanAR@users.noreply.github.com> Johan <JohanAR@users.noreply.github.com>
Johannes Gäßler <johannesg@5d6.de> Johannes Gäßler <johannesg@5d6.de>
Johannes Rudolph <johannes.rudolph@gmail.com> Johannes Rudolph <johannes.rudolph@gmail.com>
@ -274,7 +344,9 @@ Joyce <joycebrum@google.com>
Juan Calderon-Perez <835733+gaby@users.noreply.github.com> Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
Judd <foldl@users.noreply.github.com> Judd <foldl@users.noreply.github.com>
Julius Arkenberg <arki05@users.noreply.github.com> Julius Arkenberg <arki05@users.noreply.github.com>
Jun Hee Yoo <contact.jhyoo@gmail.com>
Jun Jie <71215065+junnjiee16@users.noreply.github.com> Jun Jie <71215065+junnjiee16@users.noreply.github.com>
Junil Kim <logyourself@gmail.com>
Junyang Lin <justinlin930319@hotmail.com> Junyang Lin <justinlin930319@hotmail.com>
Juraj Bednar <juraj@bednar.io> Juraj Bednar <juraj@bednar.io>
Justin Parker <jparkerweb@gmail.com> Justin Parker <jparkerweb@gmail.com>
@ -292,12 +364,14 @@ Karthik Sethuraman <k.seth1993@gmail.com>
Kasumi <90275229+kasumi-1@users.noreply.github.com> Kasumi <90275229+kasumi-1@users.noreply.github.com>
Kawrakow <48489457+ikawrakow@users.noreply.github.com> Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Keiichi Tabata <keiichi.tabata@outlook.com> Keiichi Tabata <keiichi.tabata@outlook.com>
Keke Han <hankeke303@163.com>
Kenvix ⭐ <kenvixzure@live.com> Kenvix ⭐ <kenvixzure@live.com>
Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Kevin Gibbons <bakkot@gmail.com> Kevin Gibbons <bakkot@gmail.com>
Kevin Ji <1146876+kevinji@users.noreply.github.com> Kevin Ji <1146876+kevinji@users.noreply.github.com>
Kevin Kwok <antimatter15@gmail.com> Kevin Kwok <antimatter15@gmail.com>
Kevin Lo <kevlo@kevlo.org> Kevin Lo <kevlo@kevlo.org>
Kevin Wang <kevmo314@gmail.com>
Kolen Cheung <ickc@users.noreply.github.com> Kolen Cheung <ickc@users.noreply.github.com>
Konstantin Herud <konstantin.herud@denkbares.com> Konstantin Herud <konstantin.herud@denkbares.com>
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com> Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
@ -315,22 +389,29 @@ LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
Leonardo Neumann <leonardo@neumann.dev.br> Leonardo Neumann <leonardo@neumann.dev.br>
Li Tan <tanliboy@gmail.com> Li Tan <tanliboy@gmail.com>
Linwei Wang <wanix1988@gmail.com> Linwei Wang <wanix1988@gmail.com>
Liu Jia <109258120+Septa2112@users.noreply.github.com>
Liu Jia <jia3.liu@intel.com>
LoganDark <github@logandark.mozmail.com> LoganDark <github@logandark.mozmail.com>
Loïc Carrère <loic.carrere@gmail.com>
LostRuins <39025047+LostRuins@users.noreply.github.com> LostRuins <39025047+LostRuins@users.noreply.github.com>
Luciano <lucianostrika44@gmail.com> Luciano <lucianostrika44@gmail.com>
Luo Tian <lt@basecity.com> Luo Tian <lt@basecity.com>
Lyle Dean <dean@lyle.dev> Lyle Dean <dean@lyle.dev>
M-A <maruel@gmail.com>
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com> M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
Ma Mingfei <mingfei.ma@intel.com>
Maarten ter Huurne <maarten@treewalker.org> Maarten ter Huurne <maarten@treewalker.org>
Mack Straight <eiz@users.noreply.github.com> Mack Straight <eiz@users.noreply.github.com>
Maël Kerbiriou <m431.kerbiriou@gmail.com> Maël Kerbiriou <m431.kerbiriou@gmail.com>
MaggotHATE <clay1326@gmail.com> MaggotHATE <clay1326@gmail.com>
Mahesh Madhav <67384846+heshpdx@users.noreply.github.com>
Manuel <44313466+makuche@users.noreply.github.com> Manuel <44313466+makuche@users.noreply.github.com>
Marc Köhlbrugge <subscriptions@marckohlbrugge.com> Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
Marco Matthies <71844+marcom@users.noreply.github.com> Marco Matthies <71844+marcom@users.noreply.github.com>
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
Marian Cepok <marian.cepok@gmail.com> Marian Cepok <marian.cepok@gmail.com>
Mark Fairbairn <thebaron88@gmail.com> Mark Fairbairn <thebaron88@gmail.com>
Mark Zhuang <zhuangqiubin@gmail.com>
Marko Tasic <mtasic85@gmail.com> Marko Tasic <mtasic85@gmail.com>
Markus Tavenrath <mtavenrath@users.noreply.github.com> Markus Tavenrath <mtavenrath@users.noreply.github.com>
Martin Delille <martin@delille.org> Martin Delille <martin@delille.org>
@ -342,11 +423,15 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com> Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
Matheus C. França <matheus-catarino@hotmail.com> Matheus C. França <matheus-catarino@hotmail.com>
Matheus Gabriel Alves Silva <matheusgasource@gmail.com> Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
Mathieu Geli <mathieu.geli@gmail.com>
Mathieu Nayrolles <MathieuNls@users.noreply.github.com> Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
Mathijs Henquet <mathijs.henquet@gmail.com>
Mathijs de Bruin <mathijs@mathijsfietst.nl> Mathijs de Bruin <mathijs@mathijsfietst.nl>
Matt Clayton <156335168+mattjcly@users.noreply.github.com> Matt Clayton <156335168+mattjcly@users.noreply.github.com>
Matt Pulver <matt.pulver@heavy.ai> Matt Pulver <matt.pulver@heavy.ai>
Matt Stephenson <mstephenson6@users.noreply.github.com>
Matteo Boschini <12133566+mbosc@users.noreply.github.com> Matteo Boschini <12133566+mbosc@users.noreply.github.com>
Matteo Mortari <matteo.mortari@gmail.com>
Mattheus Chediak <shammcity00@gmail.com> Mattheus Chediak <shammcity00@gmail.com>
Matthew Tejo <matthew.tejo@gmail.com> Matthew Tejo <matthew.tejo@gmail.com>
Matvey Soloviev <blackhole89@gmail.com> Matvey Soloviev <blackhole89@gmail.com>
@ -356,8 +441,10 @@ Maxime <672982+maximegmd@users.noreply.github.com>
Maximilian Winter <maximilian.winter.91@gmail.com> Maximilian Winter <maximilian.winter.91@gmail.com>
Meng Zhang <meng@tabbyml.com> Meng Zhang <meng@tabbyml.com>
Meng, Hengyu <hengyu.meng@intel.com> Meng, Hengyu <hengyu.meng@intel.com>
Mengqing Cao <cmq0113@163.com>
Merrick Christensen <merrick.christensen@gmail.com> Merrick Christensen <merrick.christensen@gmail.com>
Michael Coppola <m18coppola@gmail.com> Michael Coppola <m18coppola@gmail.com>
Michael Francis <edude03@gmail.com>
Michael Hueschen <m@mhueschen.dev> Michael Hueschen <m@mhueschen.dev>
Michael Kesper <mkesper@schokokeks.org> Michael Kesper <mkesper@schokokeks.org>
Michael Klimenko <mklimenko29@gmail.com> Michael Klimenko <mklimenko29@gmail.com>
@ -365,41 +452,57 @@ Michael Podvitskiy <podvitskiymichael@gmail.com>
Michael Potter <NanoTekGuy@Gmail.com> Michael Potter <NanoTekGuy@Gmail.com>
Michael de Gans <michael.john.degans@gmail.com> Michael de Gans <michael.john.degans@gmail.com>
Michaël de Vries <vriesdemichael@gmail.com> Michaël de Vries <vriesdemichael@gmail.com>
Michał Tuszyński <srgtuszy@gmail.com>
Mihai <mihai.chirculescu@yahoo.com> Mihai <mihai.chirculescu@yahoo.com>
Mike <ytianhui2004@gmail.com> Mike <ytianhui2004@gmail.com>
Mikko Juola <mikjuo@gmail.com> Mikko Juola <mikjuo@gmail.com>
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com> Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
Minsoo Cheong <icycle0409@snu.ac.kr>
Mirko185 <mirkosig@gmail.com> Mirko185 <mirkosig@gmail.com>
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com> Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
MistApproach <98988043+MistApproach@users.noreply.github.com>
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com> Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com> Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com> Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
Molly Sophia <mollysophia379@gmail.com>
MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
Murilo Santana <mvrilo@gmail.com> Murilo Santana <mvrilo@gmail.com>
Musab Gultekin <musabgultekin@users.noreply.github.com> Musab Gultekin <musabgultekin@users.noreply.github.com>
Nam D. Tran <42194884+namtranase@users.noreply.github.com> Nam D. Tran <42194884+namtranase@users.noreply.github.com>
Nathan Epstein <nate2@umbc.edu> Nathan Epstein <nate2@umbc.edu>
Natsu <chino@hotococoa.moe>
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com> NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
Nebula <infinitewormhole@gmail.com> Nebula <infinitewormhole@gmail.com>
Neo Zhang <14088817+arthw@users.noreply.github.com> Neo Zhang <14088817+arthw@users.noreply.github.com>
Neo Zhang <zhang.jianyu@outlook.com> Neo Zhang <zhang.jianyu@outlook.com>
Neo Zhang Jianyu <jianyu.zhang@intel.com> Neo Zhang Jianyu <jianyu.zhang@intel.com>
Neuman Vong <neuman.vong@gmail.com> Neuman Vong <neuman.vong@gmail.com>
Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
Nexesenex <124105151+Nexesenex@users.noreply.github.com> Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Niall Coates <1349685+Niall-@users.noreply.github.com> Niall Coates <1349685+Niall-@users.noreply.github.com>
Nicholai Tukanov <nicholaitukanov@gmail.com>
Nico Bosshard <nico@bosshome.ch>
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de> Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
Nicolás Pérez <nicolas_perez@brown.edu> Nicolás Pérez <nicolas_perez@brown.edu>
Nigel Bosch <pnigelb@gmail.com> Nigel Bosch <pnigelb@gmail.com>
Niklas Korz <niklas@niklaskorz.de> Niklas Korz <niklas@niklaskorz.de>
NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com>
Nikolas <127742645+nneubacher@users.noreply.github.com> Nikolas <127742645+nneubacher@users.noreply.github.com>
Nindaleth <Nindaleth@users.noreply.github.com> Nindaleth <Nindaleth@users.noreply.github.com>
OSecret <135510162+OLSecret@users.noreply.github.com>
Oleksandr Nikitin <oleksandr@tvori.info> Oleksandr Nikitin <oleksandr@tvori.info>
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com> Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
Olivier Chafik <ochafik@users.noreply.github.com> Olivier Chafik <ochafik@users.noreply.github.com>
Ondřej Čertík <ondrej@certik.us> Ondřej Čertík <ondrej@certik.us>
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com> Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
PAB <pierreantoine.bannier@gmail.com>
Pablo Duboue <pablo.duboue@gmail.com>
Pascal Patry <ppatry@mtacitlabs.com>
Patrice Ferlet <metal3d@gmail.com> Patrice Ferlet <metal3d@gmail.com>
Paul Tsochantaris <ptsochantaris@icloud.com> Paul Tsochantaris <ptsochantaris@icloud.com>
Pavel Zloi <github.com@drteam.rocks>
Pavol Rusnak <pavol@rusnak.io> Pavol Rusnak <pavol@rusnak.io>
Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com>
Pedro Cuenca <pedro@huggingface.co> Pedro Cuenca <pedro@huggingface.co>
Peter Sugihara <peter@campsh.com> Peter Sugihara <peter@campsh.com>
Phil H <5756783+phiharri@users.noreply.github.com> Phil H <5756783+phiharri@users.noreply.github.com>
@ -407,10 +510,15 @@ Philip Taron <philip.taron@gmail.com>
Phillip Kravtsov <phillip@kravtsov.net> Phillip Kravtsov <phillip@kravtsov.net>
Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com> Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
Pierrick Hymbert <pierrick.hymbert@gmail.com> Pierrick Hymbert <pierrick.hymbert@gmail.com>
Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
Plamen Minev <pacominev@gmail.com>
Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
Przemysław Pawełczyk <przemoc@gmail.com> Przemysław Pawełczyk <przemoc@gmail.com>
Qin Yue Chen <71813199+chenqiny@users.noreply.github.com> Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
Qingyou Meng <meng.qingyou@gmail.com> Qingyou Meng <meng.qingyou@gmail.com>
Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com> Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
R0CKSTAR <xiaodong.ye@mthreads.com>
R0CKSTAR <yeahdongcn@gmail.com>
RJ Adriaansen <adriaansen@eshcc.eur.nl> RJ Adriaansen <adriaansen@eshcc.eur.nl>
Radoslav Gerganov <rgerganov@gmail.com> Radoslav Gerganov <rgerganov@gmail.com>
Radosław Gryta <radek.gryta@gmail.com> Radosław Gryta <radek.gryta@gmail.com>
@ -419,11 +527,13 @@ Raj Hammeer Singh Hada <hammeerraj@gmail.com>
Ralph Soika <ralph.soika@imixs.com> Ralph Soika <ralph.soika@imixs.com>
Rand Xie <randxiexyy29@gmail.com> Rand Xie <randxiexyy29@gmail.com>
Randall Fitzgerald <randall@dasaku.net> Randall Fitzgerald <randall@dasaku.net>
Random Fly <renfei8@live.cn>
Reinforce-II <fate@eastal.com> Reinforce-II <fate@eastal.com>
Ren Xuancheng <jklj077@users.noreply.github.com> Ren Xuancheng <jklj077@users.noreply.github.com>
Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com> Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
RhinoDevel <RhinoDevel@users.noreply.github.com> RhinoDevel <RhinoDevel@users.noreply.github.com>
Riceball LEE <snowyu.lee@gmail.com> Riceball LEE <snowyu.lee@gmail.com>
Rich Dougherty <rich@rd.nz>
Richard Kiss <him@richardkiss.com> Richard Kiss <him@richardkiss.com>
Richard Roberson <richardr1126@gmail.com> Richard Roberson <richardr1126@gmail.com>
Rick G <26732651+TheFlipbook@users.noreply.github.com> Rick G <26732651+TheFlipbook@users.noreply.github.com>
@ -439,21 +549,30 @@ Robey Holderith <robey@flaminglunchbox.net>
Robyn <robyngraf@users.noreply.github.com> Robyn <robyngraf@users.noreply.github.com>
Roger Meier <r.meier@siemens.com> Roger Meier <r.meier@siemens.com>
Roland <14355895+rbur0425@users.noreply.github.com> Roland <14355895+rbur0425@users.noreply.github.com>
Romain Biessy <romain.biessy@codeplay.com>
Romain D <90720+Artefact2@users.noreply.github.com> Romain D <90720+Artefact2@users.noreply.github.com>
Romain Neutron <romain@neutron.io> Romain Neutron <romain@neutron.io>
Roman Parykin <donderom@gmail.com> Roman Parykin <donderom@gmail.com>
Ron Evans <ron@hybridgroup.com> Ron Evans <ron@hybridgroup.com>
Ron Jailall <rojailal@gmail.com> Ron Jailall <rojailal@gmail.com>
Roni <sulpher@gmx.net>
Ronny Brendel <ronnybrendel@gmail.com> Ronny Brendel <ronnybrendel@gmail.com>
Ronsor <ronsor@ronsor.pw> Ronsor <ronsor@ronsor.pw>
Rowan Hart <rowanbhart@gmail.com> Rowan Hart <rowanbhart@gmail.com>
Ruchira Hasaranga <ruchira66@gmail.com>
Ruixin Huang <18860020911@163.com>
Rune <43761327+Rune-AI@users.noreply.github.com> Rune <43761327+Rune-AI@users.noreply.github.com>
RunningLeon <maningsheng@sensetime.com>
RunningLeon <mnsheng@yeah.net>
Ryan Landay <rlanday@gmail.com> Ryan Landay <rlanday@gmail.com>
Ryder Wishart <ryderwishart@gmail.com> Ryder Wishart <ryderwishart@gmail.com>
Ryuei <louixs@users.noreply.github.com> Ryuei <louixs@users.noreply.github.com>
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com> Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
SRHMorris <69468379+SRHMorris@users.noreply.github.com>
SXX <sxx1136965276@gmail.com>
SakuraUmi <yukinon244@gmail.com> SakuraUmi <yukinon244@gmail.com>
Salvador E. Tropea <stropea@inti.gob.ar> Salvador E. Tropea <stropea@inti.gob.ar>
Salvatore Mesoraca <s.mesoraca16@gmail.com>
Sam Spilsbury <smspillaz@gmail.com> Sam Spilsbury <smspillaz@gmail.com>
Sami Farin <3876865+Safari77@users.noreply.github.com> Sami Farin <3876865+Safari77@users.noreply.github.com>
Samuel Maynard <samwmaynard@gmail.com> Samuel Maynard <samwmaynard@gmail.com>
@ -463,23 +582,29 @@ Sebastián A <sebastian.aedo29@gmail.com>
SebastianApel <13675545+SebastianApel@users.noreply.github.com> SebastianApel <13675545+SebastianApel@users.noreply.github.com>
Senemu <10880819+Senemu@users.noreply.github.com> Senemu <10880819+Senemu@users.noreply.github.com>
Sergey Alirzaev <zl29ah@gmail.com> Sergey Alirzaev <zl29ah@gmail.com>
Sergio López <slp@redhat.com>
Sergio López <slp@sinrega.org> Sergio López <slp@sinrega.org>
Sertaç Özercan <852750+sozercan@users.noreply.github.com> Sertaç Özercan <852750+sozercan@users.noreply.github.com>
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com> SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
ShadovvBeast <ShadovvBeast@gmail.com> ShadovvBeast <ShadovvBeast@gmail.com>
Shakhar Dasgupta <shakhardasgupta@gmail.com> Shakhar Dasgupta <shakhardasgupta@gmail.com>
Shane A <shanea@allenai.org>
Shangning Xu <32517059+xushangning@users.noreply.github.com> Shangning Xu <32517059+xushangning@users.noreply.github.com>
Shankar <gshankar.87@gmail.com>
Shanshan Shen <467638484@qq.com>
Shijie <821898965@qq.com> Shijie <821898965@qq.com>
Shintarou Okada <kokuzen@gmail.com> Shintarou Okada <kokuzen@gmail.com>
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com> Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
Shouzheng Liu <lshzh.hi@gmail.com> Shouzheng Liu <lshzh.hi@gmail.com>
Shuichi Tsutsumi <shuichi0526@gmail.com> Shuichi Tsutsumi <shuichi0526@gmail.com>
Shupei Fan <dymarkfan@outlook.com>
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Simon Willison <swillison@gmail.com> Simon Willison <swillison@gmail.com>
Siwen Yu <yusiwen@gmail.com> Siwen Yu <yusiwen@gmail.com>
Sky Yan <skyan83@gmail.com> Sky Yan <skyan83@gmail.com>
Slaren <2141330+slaren@users.noreply.github.com> Slaren <2141330+slaren@users.noreply.github.com>
Slava Primenko <primenko.s@gmail.com> Slava Primenko <primenko.s@gmail.com>
Small Grass Forest <zixuanxcl@gmail.com>
SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com> SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
Someone <sergei.kozlukov@aalto.fi> Someone <sergei.kozlukov@aalto.fi>
Someone Serge <sergei.kozlukov@aalto.fi> Someone Serge <sergei.kozlukov@aalto.fi>
@ -491,12 +616,15 @@ Stefan Sydow <stefan@sydow.email>
Steffen Röcker <sroecker@gmail.com> Steffen Röcker <sroecker@gmail.com>
Stephan Walter <stephan@walter.name> Stephan Walter <stephan@walter.name>
Stephen Nichols <snichols@users.noreply.github.com> Stephen Nichols <snichols@users.noreply.github.com>
Steve Bonds <sbonds@gmail.com>
Steve Grubb <ausearch.1@gmail.com> Steve Grubb <ausearch.1@gmail.com>
Steven Prichard <spprichard20@gmail.com> Steven Prichard <spprichard20@gmail.com>
Steven Roussey <sroussey@gmail.com> Steven Roussey <sroussey@gmail.com>
Steward Garcia <57494570+FSSRepo@users.noreply.github.com> Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com> Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
SuperUserNameMan <yoann@terminajones.com> SuperUserNameMan <yoann@terminajones.com>
Sutou Kouhei <kou@cozmixng.org>
Tai Duc Nguyen <taiducnguyen.drexel@gmail.com> Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
Taikono-Himazin <kazu@po.harenet.ne.jp> Taikono-Himazin <kazu@po.harenet.ne.jp>
Tameem <113388789+AhmadTameem@users.noreply.github.com> Tameem <113388789+AhmadTameem@users.noreply.github.com>
@ -507,7 +635,9 @@ Theia Vogel <theia@vgel.me>
Thérence <13496987+Royalphax@users.noreply.github.com> Thérence <13496987+Royalphax@users.noreply.github.com>
Thibault Terrasson <thibault.terrasson@gmail.com> Thibault Terrasson <thibault.terrasson@gmail.com>
Thomas Klausner <wiz@gatalith.at> Thomas Klausner <wiz@gatalith.at>
Thorsten Sommer <SommerEngineering@users.noreply.github.com>
Tim Miller <drasticactions@users.noreply.github.com> Tim Miller <drasticactions@users.noreply.github.com>
Tim Wang <overocean@gmail.com>
Timmy Knight <r2d2fish@gmail.com> Timmy Knight <r2d2fish@gmail.com>
Timothy Cronin <40186632+4imothy@users.noreply.github.com> Timothy Cronin <40186632+4imothy@users.noreply.github.com>
Ting Lou <ting.lou@gmail.com> Ting Lou <ting.lou@gmail.com>
@ -517,24 +647,31 @@ Tom C <tom.corelis@gmail.com>
Tom Jobbins <784313+TheBloke@users.noreply.github.com> Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Tomas <tom.tomas.36478119@gmail.com> Tomas <tom.tomas.36478119@gmail.com>
Tomáš Pazdiora <tomas.pazdiora@gmail.com> Tomáš Pazdiora <tomas.pazdiora@gmail.com>
Tony Wasserka <4840017+neobrain@users.noreply.github.com>
Tristan Druyen <tristan@vault81.mozmail.com> Tristan Druyen <tristan@vault81.mozmail.com>
Tristan Ross <rosscomputerguy@protonmail.com> Tristan Ross <rosscomputerguy@protonmail.com>
Trivikram Kamat <16024985+trivikr@users.noreply.github.com>
Tungsten842 <886724vf@anonaddy.me> Tungsten842 <886724vf@anonaddy.me>
Tungsten842 <quantmint@protonmail.com> Tungsten842 <quantmint@protonmail.com>
Tushar <ditsuke@protonmail.com> Tushar <ditsuke@protonmail.com>
UEXTM.com <84163508+uextm@users.noreply.github.com> UEXTM.com <84163508+uextm@users.noreply.github.com>
Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com>
Ulrich Drepper <drepper@gmail.com> Ulrich Drepper <drepper@gmail.com>
Uzo Nweke <uzoechi@gmail.com> Uzo Nweke <uzoechi@gmail.com>
Vaibhav Srivastav <vaibhavs10@gmail.com> Vaibhav Srivastav <vaibhavs10@gmail.com>
Val Kharitonov <mail@kharvd.com> Val Kharitonov <mail@kharvd.com>
Valentin Konovalov <valle.ketsujin@gmail.com> Valentin Konovalov <valle.ketsujin@gmail.com>
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com> Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
Vali Malinoiu <0x4139@gmail.com>
Victor Nogueira <felladrin@gmail.com> Victor Nogueira <felladrin@gmail.com>
Victor Z. Peng <ziliangdotme@gmail.com> Victor Z. Peng <ziliangdotme@gmail.com>
Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
Vlad <spitfireage@gmail.com> Vlad <spitfireage@gmail.com>
Vladimir <bogdad@gmail.com> Vladimir <bogdad@gmail.com>
Vladimir Malyutin <first-leon@yandex.ru> Vladimir Malyutin <first-leon@yandex.ru>
Vladimir Zorin <vladimir@deviant.guru> Vladimir Zorin <vladimir@deviant.guru>
VoidIsVoid <343750470@qq.com>
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com> Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com> WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
Weird Constructor <weirdconstructor@gmail.com> Weird Constructor <weirdconstructor@gmail.com>
@ -551,15 +688,22 @@ Xiang (Kevin) Li <kevinli020508@gmail.com>
Xiao-Yong Jin <jinxiaoyong@gmail.com> Xiao-Yong Jin <jinxiaoyong@gmail.com>
XiaotaoChen <chenxiaotao1234@gmail.com> XiaotaoChen <chenxiaotao1234@gmail.com>
Xiaoyi Chen <cxychina@gmail.com> Xiaoyi Chen <cxychina@gmail.com>
Xie Yanbo <xieyanbo@gmail.com>
Xingchen Song(宋星辰) <xingchensong1996@163.com> Xingchen Song(宋星辰) <xingchensong1996@163.com>
Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
Xuan Son Nguyen <thichthat@gmail.com> Xuan Son Nguyen <thichthat@gmail.com>
Yaiko <elyaiko@hotmail.com>
Yann Follet <131855179+YannFollet@users.noreply.github.com> Yann Follet <131855179+YannFollet@users.noreply.github.com>
Yaroslav <yaroslav.yashin@me.com> Yaroslav <yaroslav.yashin@me.com>
Yazan Agha-Schrader <mountaiin@icloud.com> Yazan Agha-Schrader <mountaiin@icloud.com>
Yiming Cui <conandiy@vip.qq.com> Yiming Cui <conandiy@vip.qq.com>
Yishuo Wang <MeouSker77@outlook.com> Yishuo Wang <MeouSker77@outlook.com>
Yoshi Suhara <y.suhara@gmail.com>
Yoshi Suhara <ysuhara@nvidia.com>
Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com> Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
Yui <dev@sleepyyui.com> Yui <dev@sleepyyui.com>
Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Yusuf Kağan Hanoğlu <hanoglu@yahoo.com> Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com> Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
ZHAOKAI WANG <sanxianwei@163.com> ZHAOKAI WANG <sanxianwei@163.com>
@ -568,6 +712,8 @@ Zay <95888118+isaiahbjork@users.noreply.github.com>
Zenix <zenixls2@gmail.com> Zenix <zenixls2@gmail.com>
Zhang Peiyuan <a1286225768@gmail.com> Zhang Peiyuan <a1286225768@gmail.com>
Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com> Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
Zhiyuan Li <lizhiyuan@uniartisan.com>
ZhouYuChen <zhouyuchen@naver.com> ZhouYuChen <zhouyuchen@naver.com>
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com> Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com> Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
@ -581,6 +727,7 @@ alexpinel <93524949+alexpinel@users.noreply.github.com>
alonfaraj <alonfaraj@gmail.com> alonfaraj <alonfaraj@gmail.com>
alwqx <kenan3015@gmail.com> alwqx <kenan3015@gmail.com>
amd-lalithnc <lalithnc@amd.com> amd-lalithnc <lalithnc@amd.com>
amritahs-ibm <amritahs@linux.vnet.ibm.com>
andrijdavid <david@geek.mg> andrijdavid <david@geek.mg>
anon998 <131767832+anon998@users.noreply.github.com> anon998 <131767832+anon998@users.noreply.github.com>
anzz1 <anzz1@live.com> anzz1 <anzz1@live.com>
@ -588,14 +735,18 @@ apaz <aarpazdera@gmail.com>
apcameron <37645737+apcameron@users.noreply.github.com> apcameron <37645737+apcameron@users.noreply.github.com>
arch-btw <57669023+arch-btw@users.noreply.github.com> arch-btw <57669023+arch-btw@users.noreply.github.com>
arcrank <arcrank@gmail.com> arcrank <arcrank@gmail.com>
ardfork <134447697+ardfork@users.noreply.github.com>
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
at8u <129688334+at8u@users.noreply.github.com> at8u <129688334+at8u@users.noreply.github.com>
automaticcat <daogiatuank54@gmail.com> automaticcat <daogiatuank54@gmail.com>
awatuna <23447591+awatuna@users.noreply.github.com>
b4b4o <zwbao@foxmail.com>
bandoti <141645996+bandoti@users.noreply.github.com> bandoti <141645996+bandoti@users.noreply.github.com>
beiller <beiller@gmail.com> beiller <beiller@gmail.com>
bhubbb <79117352+bhubbb@users.noreply.github.com> bhubbb <79117352+bhubbb@users.noreply.github.com>
bmwl <brian.marshall@tolko.com> bmwl <brian.marshall@tolko.com>
bobqianic <129547291+bobqianic@users.noreply.github.com> bobqianic <129547291+bobqianic@users.noreply.github.com>
brucepro <git@brucepro.net>
bryanSwk <93190252+bryanSwk@users.noreply.github.com> bryanSwk <93190252+bryanSwk@users.noreply.github.com>
bsilvereagle <bsilvereagle@users.noreply.github.com> bsilvereagle <bsilvereagle@users.noreply.github.com>
bssrdf <merlintiger@hotmail.com> bssrdf <merlintiger@hotmail.com>
@ -614,10 +765,14 @@ cpumaxx <163466046+cpumaxx@users.noreply.github.com>
crasm <crasm@git.vczf.net> crasm <crasm@git.vczf.net>
crasm <crasm@git.vczf.us> crasm <crasm@git.vczf.us>
daboe01 <daboe01@googlemail.com> daboe01 <daboe01@googlemail.com>
daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com>
daminho <37615795+daminho@users.noreply.github.com>
david raistrick <keen99@users.noreply.github.com> david raistrick <keen99@users.noreply.github.com>
ddh0 <dylanhalladay02@icloud.com> ddh0 <dylanhalladay02@icloud.com>
ddpasa <112642920+ddpasa@users.noreply.github.com> ddpasa <112642920+ddpasa@users.noreply.github.com>
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com> deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
devojony <61173062+devojony@users.noreply.github.com>
ditsuke <ditsuke@protonmail.com>
divinity76 <divinity76@gmail.com> divinity76 <divinity76@gmail.com>
dm4 <sunrisedm4@gmail.com> dm4 <sunrisedm4@gmail.com>
dotpy314 <33351922+dotpy314@users.noreply.github.com> dotpy314 <33351922+dotpy314@users.noreply.github.com>
@ -629,14 +784,18 @@ ebraminio <ebraminio@gmail.com>
eiery <19350831+eiery@users.noreply.github.com> eiery <19350831+eiery@users.noreply.github.com>
eric8607242 <e0928021388@gmail.com> eric8607242 <e0928021388@gmail.com>
fairydreaming <166155368+fairydreaming@users.noreply.github.com> fairydreaming <166155368+fairydreaming@users.noreply.github.com>
fengerhu1 <2748250768@qq.com>
fraxy-v <65565042+fraxy-v@users.noreply.github.com> fraxy-v <65565042+fraxy-v@users.noreply.github.com>
github-actions[bot] <github-actions[bot]@users.noreply.github.com> github-actions[bot] <github-actions[bot]@users.noreply.github.com>
gliptic <gliptic@users.noreply.github.com> gliptic <gliptic@users.noreply.github.com>
goerch <jhr.walter@t-online.de> goerch <jhr.walter@t-online.de>
grahameth <96447521+grahameth@users.noreply.github.com> grahameth <96447521+grahameth@users.noreply.github.com>
gtygo <gtydoit@gmail.com>
gwjr <502526+gwjr@users.noreply.github.com> gwjr <502526+gwjr@users.noreply.github.com>
h-h-h-h <13482553+h-h-h-h@users.noreply.github.com> h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
hankcs <cnhankmc@gmail.com> hankcs <cnhankmc@gmail.com>
haopeng <657407891@qq.com>
hipudding <huafengchun@gmail.com>
hoangmit <hoangmit@users.noreply.github.com> hoangmit <hoangmit@users.noreply.github.com>
hongbo.mo <352280764@qq.com> hongbo.mo <352280764@qq.com>
hopkins385 <98618192+hopkins385@users.noreply.github.com> hopkins385 <98618192+hopkins385@users.noreply.github.com>
@ -649,12 +808,14 @@ hxer7963 <hxer7963@gmail.com>
hydai <z54981220@gmail.com> hydai <z54981220@gmail.com>
iSma <ismail.senhaji@gmail.com> iSma <ismail.senhaji@gmail.com>
iacore <74560659+iacore@users.noreply.github.com> iacore <74560659+iacore@users.noreply.github.com>
icppWorld <124377669+icppWorld@users.noreply.github.com>
igarnier <igarnier@protonmail.com> igarnier <igarnier@protonmail.com>
intelmatt <61025942+intelmatt@users.noreply.github.com> intelmatt <61025942+intelmatt@users.noreply.github.com>
iohub <rickyang.pro@gmail.com> iohub <rickyang.pro@gmail.com>
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com> jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
jameswu2014 <545426914@qq.com> jameswu2014 <545426914@qq.com>
jdomke <28772296+jdomke@users.noreply.github.com>
jiez <373447296@qq.com> jiez <373447296@qq.com>
jneem <joeneeman@gmail.com> jneem <joeneeman@gmail.com>
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com> joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
@ -677,28 +838,35 @@ klosax <131523366+klosax@users.noreply.github.com>
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
kunnis <kunnis@users.noreply.github.com> kunnis <kunnis@users.noreply.github.com>
kuronekosaiko <EvanChanJ@163.com> kuronekosaiko <EvanChanJ@163.com>
kustaaya <58045274+kustaaya@users.noreply.github.com>
kuvaus <22169537+kuvaus@users.noreply.github.com> kuvaus <22169537+kuvaus@users.noreply.github.com>
kwin1412 <42286931+kwin1412@users.noreply.github.com> kwin1412 <42286931+kwin1412@users.noreply.github.com>
l3utterfly <gc.pthzfoldr@gmail.com> l3utterfly <gc.pthzfoldr@gmail.com>
laik <laik.lj@me.com>
ldwang <ftgreat@163.com> ldwang <ftgreat@163.com>
le.chang <cljs118@126.com> le.chang <cljs118@126.com>
leejet <leejet714@gmail.com> leejet <leejet714@gmail.com>
leo-pony <nengjunma@outlook.com>
limitedAtonement <limitedAtonement@users.noreply.github.com> limitedAtonement <limitedAtonement@users.noreply.github.com>
liuwei-git <14815172+liuwei-git@users.noreply.github.com> liuwei-git <14815172+liuwei-git@users.noreply.github.com>
lon <114724657+longregen@users.noreply.github.com> lon <114724657+longregen@users.noreply.github.com>
loonerin <132926317+loonerin@users.noreply.github.com> loonerin <132926317+loonerin@users.noreply.github.com>
ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
luoyu-intel <yu.luo@intel.com> luoyu-intel <yu.luo@intel.com>
m3ndax <adrian.goessl@outlook.com> m3ndax <adrian.goessl@outlook.com>
maddes8cht <55592906+maddes8cht@users.noreply.github.com> maddes8cht <55592906+maddes8cht@users.noreply.github.com>
makomk <makosoft@googlemail.com> makomk <makosoft@googlemail.com>
manikbhandari <mbbhandarimanik2@gmail.com> manikbhandari <mbbhandarimanik2@gmail.com>
maor-ps <154728172+maor-ps@users.noreply.github.com> maor-ps <154728172+maor-ps@users.noreply.github.com>
matiaslin <45382001+matiaslin@users.noreply.github.com>
matteo <matteogeniaccio@yahoo.it>
mdrokz <mohammadmunshi@gmail.com> mdrokz <mohammadmunshi@gmail.com>
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
minarchist <minarchist@users.noreply.github.com> minarchist <minarchist@users.noreply.github.com>
mj-shifu <77107165+mj-shifu@users.noreply.github.com> mj-shifu <77107165+mj-shifu@users.noreply.github.com>
mmyjona <jonathan.gonse@gmail.com> mmyjona <jonathan.gonse@gmail.com>
momonga <115213907+mmnga@users.noreply.github.com> momonga <115213907+mmnga@users.noreply.github.com>
momonga <146910567+mmngays@users.noreply.github.com>
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com> moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
mzcu <milos.cubrilo@gmail.com> mzcu <milos.cubrilo@gmail.com>
nanahi <130121847+na-na-hi@users.noreply.github.com> nanahi <130121847+na-na-hi@users.noreply.github.com>
@ -716,8 +884,10 @@ omahs <73983677+omahs@users.noreply.github.com>
oobabooga <112222186+oobabooga@users.noreply.github.com> oobabooga <112222186+oobabooga@users.noreply.github.com>
opparco <parco.opaai@gmail.com> opparco <parco.opaai@gmail.com>
ostix360 <55257054+ostix360@users.noreply.github.com> ostix360 <55257054+ostix360@users.noreply.github.com>
pculliton <phillipculliton@gmail.com>
pengxin99 <pengxin.yuan@intel.com> pengxin99 <pengxin.yuan@intel.com>
perserk <perserk@gmail.com> perserk <perserk@gmail.com>
piDack <104877312+piDack@users.noreply.github.com>
pmysl <piotr.myslinski@outlook.com> pmysl <piotr.myslinski@outlook.com>
postmasters <namnguyen@google.com> postmasters <namnguyen@google.com>
pudepiedj <pudepiedj@gmail.com> pudepiedj <pudepiedj@gmail.com>
@ -733,6 +903,7 @@ runfuture <runfuture@users.noreply.github.com>
sandyiscool <sandyiscool@gmail.com> sandyiscool <sandyiscool@gmail.com>
sasha0552 <admin@sasha0552.org> sasha0552 <admin@sasha0552.org>
semidark <me@semidark.net> semidark <me@semidark.net>
serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
sharpHL <132747147+sharpHL@users.noreply.github.com> sharpHL <132747147+sharpHL@users.noreply.github.com>
shibe2 <shibe@tuta.io> shibe2 <shibe@tuta.io>
singularity <12184989+singularity-s0@users.noreply.github.com> singularity <12184989+singularity-s0@users.noreply.github.com>
@ -741,42 +912,55 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
slaren <2141330+slaren@users.noreply.github.com> slaren <2141330+slaren@users.noreply.github.com>
slaren <slarengh@gmail.com> slaren <slarengh@gmail.com>
snadampal <87143774+snadampal@users.noreply.github.com> snadampal <87143774+snadampal@users.noreply.github.com>
standby24x7 <standby24x7@gmail.com>
staviq <staviq@gmail.com> staviq <staviq@gmail.com>
stduhpf <stephduh@live.fr> stduhpf <stephduh@live.fr>
strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com> strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
swittk <switt1995@gmail.com> swittk <switt1995@gmail.com>
takov751 <40316768+takov751@users.noreply.github.com> takov751 <40316768+takov751@users.noreply.github.com>
tarcey <cey.tarik@gmail.com> tarcey <cey.tarik@gmail.com>
tc-mb <157115220+tc-mb@users.noreply.github.com>
texmex76 <40733439+texmex76@users.noreply.github.com> texmex76 <40733439+texmex76@users.noreply.github.com>
thement <40525767+thement@users.noreply.github.com> thement <40525767+thement@users.noreply.github.com>
thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
tjohnman <tjohnman@users.noreply.github.com> tjohnman <tjohnman@users.noreply.github.com>
toyer <2042519524@qq.com>
tslmy <tslmy@users.noreply.github.com> tslmy <tslmy@users.noreply.github.com>
ubik2 <ubik2@users.noreply.github.com> ubik2 <ubik2@users.noreply.github.com>
uint256_t <konndennsa@gmail.com> uint256_t <konndennsa@gmail.com>
uint256_t <maekawatoshiki1017@gmail.com> uint256_t <maekawatoshiki1017@gmail.com>
unbounded <haakon@likedan.net> unbounded <haakon@likedan.net>
uvos <devnull@uvos.xyz>
valiray <133289098+valiray@users.noreply.github.com> valiray <133289098+valiray@users.noreply.github.com>
vb <vaibhavs10@gmail.com>
vik <vikhyatk@gmail.com> vik <vikhyatk@gmail.com>
viric <viric@viric.name> viric <viric@viric.name>
vodkaslime <646329483@qq.com> vodkaslime <646329483@qq.com>
vvhg1 <94630311+vvhg1@users.noreply.github.com> vvhg1 <94630311+vvhg1@users.noreply.github.com>
vxiiduu <73044267+vxiiduu@users.noreply.github.com> vxiiduu <73044267+vxiiduu@users.noreply.github.com>
wangshuai09 <391746016@qq.com>
wbpxre150 <100937007+wbpxre150@users.noreply.github.com> wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
whoreson <139810751+whoreson@users.noreply.github.com> whoreson <139810751+whoreson@users.noreply.github.com>
woachk <24752637+woachk@users.noreply.github.com> woachk <24752637+woachk@users.noreply.github.com>
wonjun Jang <strutive07@gmail.com> wonjun Jang <strutive07@gmail.com>
woodx <124784234+woodx9@users.noreply.github.com> woodx <124784234+woodx9@users.noreply.github.com>
wwoodsTM <104587230+wwoodsTM@users.noreply.github.com>
wzy <32936898+Freed-Wu@users.noreply.github.com> wzy <32936898+Freed-Wu@users.noreply.github.com>
xaedes <xaedes@gmail.com> xaedes <xaedes@gmail.com>
xaedes <xaedes@googlemail.com> xaedes <xaedes@googlemail.com>
xctan <axunlei@gmail.com>
xloem <0xloem@gmail.com> xloem <0xloem@gmail.com>
yangli2 <yangli2@gmail.com> yangli2 <yangli2@gmail.com>
yuiseki <yuiseki@gmail.com> yuiseki <yuiseki@gmail.com>
yuri@FreeBSD <yurivict@users.noreply.github.com>
zakkor <edward.partenie@gmail.com> zakkor <edward.partenie@gmail.com>
zhangkaihuo <zhangkaihuo@gmail.com> zhangkaihuo <zhangkaihuo@gmail.com>
zhentaoyu <zhentao.yu@intel.com>
zhouwg <6889919+zhouwg@users.noreply.github.com> zhouwg <6889919+zhouwg@users.noreply.github.com>
zhouwg <zhouwg2000@gmail.com> zhouwg <zhouwg2000@gmail.com>
zrm <trustiosity.zrm@gmail.com> zrm <trustiosity.zrm@gmail.com>
Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com> Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
杨朱 · Kiki <baofa.fan@daocloud.io>
源文雨 <41315874+fumiama@users.noreply.github.com> 源文雨 <41315874+fumiama@users.noreply.github.com>
蕭澧邦 <45505768+shou692199@users.noreply.github.com>
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com> Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>

View file

@ -46,6 +46,11 @@ if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS) add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
endif() endif()
if (MSVC)
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
endif()
# #
# option list # option list
# #
@ -75,6 +80,7 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
# Required for relocatable CMake package # Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
# override ggml options # override ggml options
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD}) set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
@ -88,10 +94,6 @@ if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE_DEFAULT ON) set(GGML_LLAMAFILE_DEFAULT ON)
endif() endif()
if (NOT DEFINED GGML_AMX)
set(GGML_AMX ON)
endif()
if (NOT DEFINED GGML_CUDA_GRAPHS) if (NOT DEFINED GGML_CUDA_GRAPHS)
set(GGML_CUDA_GRAPHS_DEFAULT ON) set(GGML_CUDA_GRAPHS_DEFAULT ON)
endif() endif()
@ -140,7 +142,6 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
# At the moment some compile definitions are placed within the ggml/src # At the moment some compile definitions are placed within the ggml/src
# directory but not exported on the `ggml` target. This could be improved by # directory but not exported on the `ggml` target. This could be improved by
# determining _precisely_ which defines are necessary for the llama-config # determining _precisely_ which defines are necessary for the llama-config
@ -157,8 +158,11 @@ if (GGML_TARGET_DEFINES)
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES}) list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
endif() endif()
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
# all public headers
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h) set(LLAMA_PUBLIC_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
install(TARGETS llama LIBRARY PUBLIC_HEADER) install(TARGETS llama LIBRARY PUBLIC_HEADER)
configure_package_config_file( configure_package_config_file(

View file

@ -24,11 +24,19 @@
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
} }
}, },
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } }, { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
{ "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },
{
"name": "x64-windows-llvm", "hidden": true,
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
}
},
{ {
"name": "arm64-windows-msvc", "hidden": true, "name": "arm64-windows-msvc", "hidden": true,
@ -57,25 +65,33 @@
} }
}, },
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] }, { "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] }, { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] }, { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
{ "name": "arm64-apple-clang-debug" , "inherits": [ "base", "arm64-apple-clang", "debug" ] }, { "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
{ "name": "arm64-apple-clang-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg" ] }, { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
{ "name": "arm64-apple-clang+static-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] }, { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] }, { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] }, { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] }, { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
{ "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
{ "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] }, { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] }, { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] }, { "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] }, { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }, { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] } { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
{ "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
{ "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
] ]
} }

11
CODEOWNERS Normal file
View file

@ -0,0 +1,11 @@
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
/ci/ @ggerganov
/.devops/*.Dockerfile @ngxson
/examples/server/ @ngxson
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler

View file

@ -1,9 +1,10 @@
# Pull requests (for contributors) # Pull requests (for contributors)
- Test your changes: - Test your changes:
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the `ggml` library - Execute [the full CI locally on your machine](ci/README.md) before publishing
- Execute [the full CI locally on your machine](ci/README.md) before publishing - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
- Optionally rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments - If your PR becomes stale, don't hesitate to ping the maintainers in the comments
@ -12,20 +13,111 @@
- Squash-merge PRs - Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)` - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules - Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
# Coding guidelines # Coding guidelines
- Avoid adding third-party dependencies, extra files, extra headers, etc. - Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures - Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple - Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit - Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963) - Use sized integer types such as `int32_t` in the public API, e.g. `size_t` may also be appropriate for allocation sizes or byte offsets
- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo`
- In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary
```cpp
// OK
llama_context * ctx;
const llama_rope_type rope_type;
// not OK
struct llama_context * ctx;
const enum llama_rope_type rope_type;
```
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ - Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
![matmul](media/matmul.png) ![matmul](media/matmul.png)
# Naming guidelines
- Use `snake_case` for function, variable and type names
- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
```cpp
// not OK
int small_number;
int big_number;
// OK
int number_small;
int number_big;
```
- Enum values are always in upper case and prefixed with the enum name
```cpp
enum llama_vocab_type {
LLAMA_VOCAB_TYPE_NONE = 0,
LLAMA_VOCAB_TYPE_SPM = 1,
LLAMA_VOCAB_TYPE_BPE = 2,
LLAMA_VOCAB_TYPE_WPM = 3,
LLAMA_VOCAB_TYPE_UGM = 4,
LLAMA_VOCAB_TYPE_RWKV = 5,
};
```
- The general naming pattern is `<class>_<method>`, with `<method>` being `<action>_<noun>`
```cpp
llama_model_init(); // class: "llama_model", method: "init"
llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove"
llama_sampler_get_seed(); // class: "llama_sampler", method: "get_seed"
llama_set_embeddings(); // class: "llama_context", method: "set_embeddings"
llama_n_threads(); // class: "llama_context", method: "n_threads"
llama_adapter_lora_free(); // class: "llama_adapter_lora", method: "free"
```
- The `get` `<action>` can be omitted
- The `<noun>` can be omitted if not necessary
- The `_context` suffix of the `<class>` is optional. Use it to disambiguate symbols when needed
- Use `init`/`free` for constructor/destructor `<action>`
- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else
```cpp
typedef struct llama_context * llama_context_t;
enum llama_pooling_type llama_pooling_type(const llama_context_t ctx);
```
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_
- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension
- Python filenames are all lowercase with underscores
- _(TODO: abbreviations usage)_
# Preprocessor directives
- _(TODO: add guidelines with examples and apply them to the codebase)_
```cpp
#ifdef FOO
#endif // FOO
```
# Documentation
- Documentation is a community effort
- When you need to look into the source code to figure out how to use an API consider adding a short summary to the header file for future reference
- When you notice incorrect or outdated documentation, please update it
# Resources # Resources
The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects: The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:

647
Makefile
View file

@ -1,3 +1,7 @@
ifndef LLAMA_MAKEFILE
$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
endif
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = \ BUILD_TARGETS = \
libllava.a \ libllava.a \
@ -18,6 +22,7 @@ BUILD_TARGETS = \
llama-infill \ llama-infill \
llama-llava-cli \ llama-llava-cli \
llama-minicpmv-cli\ llama-minicpmv-cli\
llama-qwen2vl-cli\
llama-lookahead \ llama-lookahead \
llama-lookup \ llama-lookup \
llama-lookup-create \ llama-lookup-create \
@ -34,6 +39,7 @@ BUILD_TARGETS = \
llama-server \ llama-server \
llama-simple \ llama-simple \
llama-simple-chat \ llama-simple-chat \
llama-run \
llama-speculative \ llama-speculative \
llama-tokenize \ llama-tokenize \
llama-vdot \ llama-vdot \
@ -48,7 +54,6 @@ TEST_TARGETS = \
tests/test-backend-ops \ tests/test-backend-ops \
tests/test-chat-template \ tests/test-chat-template \
tests/test-double-float \ tests/test-double-float \
tests/test-grad0 \
tests/test-grammar-integration \ tests/test-grammar-integration \
tests/test-grammar-parser \ tests/test-grammar-parser \
tests/test-json-schema-to-grammar \ tests/test-json-schema-to-grammar \
@ -251,11 +256,11 @@ endif
# Compile flags # Compile flags
# #
# keep standard at C11 and C++11 # keep standard at C11 and C++17
MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
MK_CFLAGS = -std=c11 -fPIC MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC MK_CXXFLAGS = -std=c++17 -fPIC
MK_NVCCFLAGS = -std=c++11 MK_NVCCFLAGS = -std=c++17
ifdef LLAMA_NO_CCACHE ifdef LLAMA_NO_CCACHE
GGML_NO_CCACHE := 1 GGML_NO_CCACHE := 1
@ -291,6 +296,7 @@ endif
# some memory allocation are available on Linux through GNU extensions in libc # some memory allocation are available on Linux through GNU extensions in libc
ifeq ($(UNAME_S),Linux) ifeq ($(UNAME_S),Linux)
MK_CPPFLAGS += -D_GNU_SOURCE MK_CPPFLAGS += -D_GNU_SOURCE
MK_LDFLAGS += -ldl
endif endif
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
@ -359,6 +365,10 @@ ifdef LLAMA_SERVER_SSL
MK_LDFLAGS += -lssl -lcrypto MK_LDFLAGS += -lssl -lcrypto
endif endif
ifndef GGML_NO_CPU_AARCH64
MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
endif
# warnings # warnings
WARN_FLAGS = \ WARN_FLAGS = \
-Wall \ -Wall \
@ -436,6 +446,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
MK_CFLAGS += -march=native -mtune=native MK_CFLAGS += -march=native -mtune=native
HOST_CXXFLAGS += -march=native -mtune=native HOST_CXXFLAGS += -march=native -mtune=native
# Usage AMX build test
#MK_CFLAGS += -march=graniterapids -mtune=graniterapids
#HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids
# Usage AVX-only # Usage AVX-only
#MK_CFLAGS += -mfma -mf16c -mavx #MK_CFLAGS += -mfma -mf16c -mavx
#MK_CXXFLAGS += -mfma -mf16c -mavx #MK_CXXFLAGS += -mfma -mf16c -mavx
@ -523,70 +537,62 @@ ifndef GGML_NO_ACCELERATE
# Mac OS - include Accelerate framework. # Mac OS - include Accelerate framework.
# `-framework Accelerate` works both with Apple Silicon and Mac Intel # `-framework Accelerate` works both with Apple Silicon and Mac Intel
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
MK_LDFLAGS += -framework Accelerate MK_LDFLAGS += -framework Accelerate
OBJ_GGML += ggml/src/ggml-blas.o OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
endif endif
endif # GGML_NO_ACCELERATE endif # GGML_NO_ACCELERATE
ifdef GGML_MUSA
CC := clang
CXX := clang++
GGML_CUDA := 1
MK_CPPFLAGS += -DGGML_USE_MUSA
endif
ifndef GGML_NO_OPENMP ifndef GGML_NO_OPENMP
MK_CPPFLAGS += -DGGML_USE_OPENMP MK_CPPFLAGS += -DGGML_USE_OPENMP
MK_CFLAGS += -fopenmp MK_CFLAGS += -fopenmp
MK_CXXFLAGS += -fopenmp MK_CXXFLAGS += -fopenmp
ifdef GGML_MUSA
MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
MK_LDFLAGS += -L/usr/lib/llvm-10/lib
endif # GGML_MUSA
endif # GGML_NO_OPENMP endif # GGML_NO_OPENMP
ifdef GGML_OPENBLAS ifdef GGML_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
MK_LDFLAGS += $(shell pkg-config --libs openblas) MK_LDFLAGS += $(shell pkg-config --libs openblas)
OBJ_GGML += ggml/src/ggml-blas.o OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
endif # GGML_OPENBLAS endif # GGML_OPENBLAS
ifdef GGML_OPENBLAS64 ifdef GGML_OPENBLAS64
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64)
MK_LDFLAGS += $(shell pkg-config --libs openblas64) MK_LDFLAGS += $(shell pkg-config --libs openblas64)
OBJ_GGML += ggml/src/ggml-blas.o OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
endif # GGML_OPENBLAS64 endif # GGML_OPENBLAS64
ifdef GGML_BLIS ifdef GGML_BLIS
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib MK_LDFLAGS += -lblis -L/usr/local/lib
OBJ_GGML += ggml/src/ggml-blas.o OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
endif # GGML_BLIS endif # GGML_BLIS
ifdef GGML_NVPL ifdef GGML_NVPL
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
OBJ_GGML += ggml/src/ggml-blas.o OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
endif # GGML_NVPL endif # GGML_NVPL
ifndef GGML_NO_LLAMAFILE ifndef GGML_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJ_GGML += ggml/src/llamafile/sgemm.o OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o
endif endif
ifndef GGML_NO_AMX ifndef GGML_NO_AMX
MK_CPPFLAGS += -DGGML_USE_AMX MK_CPPFLAGS += -DGGML_USE_AMX
OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o
endif endif
# only necessary for the CPU backend files
MK_CPPFLAGS += -Iggml/src/ggml-cpu
ifdef GGML_RPC ifdef GGML_RPC
MK_CPPFLAGS += -DGGML_USE_RPC MK_CPPFLAGS += -DGGML_USE_RPC
OBJ_GGML += ggml/src/ggml-rpc.o OBJ_GGML_EXT += ggml/src/ggml-rpc.o
endif # GGML_RPC endif # GGML_RPC
OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu)) OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
@ -601,41 +607,27 @@ else
endif # GGML_CUDA_FA_ALL_QUANTS endif # GGML_CUDA_FA_ALL_QUANTS
ifdef GGML_CUDA ifdef GGML_CUDA
ifdef GGML_MUSA ifneq ('', '$(wildcard /opt/cuda)')
ifneq ('', '$(wildcard /opt/musa)') CUDA_PATH ?= /opt/cuda
CUDA_PATH ?= /opt/musa
else
CUDA_PATH ?= /usr/local/musa
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22
else else
ifneq ('', '$(wildcard /opt/cuda)') CUDA_PATH ?= /usr/local/cuda
CUDA_PATH ?= /opt/cuda endif
else
CUDA_PATH ?= /usr/local/cuda
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_NVCCFLAGS += -use_fast_math MK_NVCCFLAGS += -use_fast_math
endif # GGML_MUSA
OBJ_GGML += ggml/src/ggml-cuda.o OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
OBJ_GGML += $(OBJ_CUDA_TMPL) OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
ifdef LLAMA_FATAL_WARNINGS ifdef LLAMA_FATAL_WARNINGS
MK_NVCCFLAGS += -Werror all-warnings MK_NVCCFLAGS += -Werror all-warnings
endif # LLAMA_FATAL_WARNINGS endif # LLAMA_FATAL_WARNINGS
ifndef GGML_MUSA
ifndef JETSON_EOL_MODULE_DETECT ifndef JETSON_EOL_MODULE_DETECT
MK_NVCCFLAGS += --forward-unknown-to-host-compiler MK_NVCCFLAGS += --forward-unknown-to-host-compiler
endif # JETSON_EOL_MODULE_DETECT endif # JETSON_EOL_MODULE_DETECT
endif # GGML_MUSA
ifdef LLAMA_DEBUG ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo MK_NVCCFLAGS += -lineinfo
@ -648,11 +640,7 @@ endif # GGML_CUDA_DEBUG
ifdef GGML_CUDA_NVCC ifdef GGML_CUDA_NVCC
NVCC = $(CCACHE) $(GGML_CUDA_NVCC) NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
else else
ifdef GGML_MUSA NVCC = $(CCACHE) nvcc
NVCC = $(CCACHE) mcc
else
NVCC = $(CCACHE) nvcc
endif # GGML_MUSA
endif # GGML_CUDA_NVCC endif # GGML_CUDA_NVCC
ifdef CUDA_DOCKER_ARCH ifdef CUDA_DOCKER_ARCH
@ -661,10 +649,6 @@ else ifndef CUDA_POWER_ARCH
MK_NVCCFLAGS += -arch=native MK_NVCCFLAGS += -arch=native
endif # CUDA_DOCKER_ARCH endif # CUDA_DOCKER_ARCH
ifdef GGML_CUDA_FORCE_DMMV
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # GGML_CUDA_FORCE_DMMV
ifdef GGML_CUDA_FORCE_MMQ ifdef GGML_CUDA_FORCE_MMQ
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
endif # GGML_CUDA_FORCE_MMQ endif # GGML_CUDA_FORCE_MMQ
@ -673,20 +657,6 @@ ifdef GGML_CUDA_FORCE_CUBLAS
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
endif # GGML_CUDA_FORCE_CUBLAS endif # GGML_CUDA_FORCE_CUBLAS
ifdef GGML_CUDA_DMMV_X
MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
else
MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
endif # GGML_CUDA_DMMV_X
ifdef GGML_CUDA_MMV_Y
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
else ifdef GGML_CUDA_DMMV_Y
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for backwards compatibility
else
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
endif # GGML_CUDA_MMV_Y
ifdef GGML_CUDA_F16 ifdef GGML_CUDA_F16
MK_NVCCFLAGS += -DGGML_CUDA_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16
endif # GGML_CUDA_F16 endif # GGML_CUDA_F16
@ -695,12 +665,6 @@ ifdef GGML_CUDA_DMMV_F16
MK_NVCCFLAGS += -DGGML_CUDA_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16
endif # GGML_CUDA_DMMV_F16 endif # GGML_CUDA_DMMV_F16
ifdef GGML_CUDA_KQUANTS_ITER
MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
else
MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE) MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
else else
@ -724,15 +688,9 @@ define NVCC_COMPILE
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE endef # NVCC_COMPILE
else else
ifdef GGML_MUSA
define NVCC_COMPILE
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
endef # NVCC_COMPILE
else
define NVCC_COMPILE define NVCC_COMPILE
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE endef # NVCC_COMPILE
endif # GGML_MUSA
endif # JETSON_EOL_MODULE_DETECT endif # JETSON_EOL_MODULE_DETECT
ggml/src/ggml-cuda/%.o: \ ggml/src/ggml-cuda/%.o: \
@ -742,8 +700,8 @@ ggml/src/ggml-cuda/%.o: \
ggml/src/ggml-cuda/common.cuh ggml/src/ggml-cuda/common.cuh
$(NVCC_COMPILE) $(NVCC_COMPILE)
ggml/src/ggml-cuda.o: \ ggml/src/ggml-cuda/ggml-cuda.o: \
ggml/src/ggml-cuda.cu \ ggml/src/ggml-cuda/ggml-cuda.cu \
ggml/include/ggml-cuda.h \ ggml/include/ggml-cuda.h \
ggml/include/ggml.h \ ggml/include/ggml.h \
ggml/include/ggml-backend.h \ ggml/include/ggml-backend.h \
@ -754,9 +712,9 @@ ggml/src/ggml-cuda.o: \
endif # GGML_CUDA endif # GGML_CUDA
ifdef GGML_VULKAN ifdef GGML_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += $(shell pkg-config --libs vulkan) MK_LDFLAGS += $(shell pkg-config --libs vulkan)
OBJ_GGML += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o OBJ_GGML_EXT += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
ifdef GGML_VULKAN_CHECK_RESULTS ifdef GGML_VULKAN_CHECK_RESULTS
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
@ -786,10 +744,10 @@ GLSLC_CMD = glslc
_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen _ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp _ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp _ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
_ggml_vk_input_dir = ggml/src/vulkan-shaders _ggml_vk_input_dir = ggml/src/ggml-vulkan/vulkan-shaders
_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp) _ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source) ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@ $(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
$(_ggml_vk_header): $(_ggml_vk_source) $(_ggml_vk_header): $(_ggml_vk_source)
@ -801,12 +759,12 @@ $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
--target-hpp $(_ggml_vk_header) \ --target-hpp $(_ggml_vk_header) \
--target-cpp $(_ggml_vk_source) --target-cpp $(_ggml_vk_source)
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp $(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
endif # GGML_VULKAN endif # GGML_VULKAN
ifdef GGML_HIPBLAS ifdef GGML_HIP
ifeq ($(wildcard /opt/rocm),) ifeq ($(wildcard /opt/rocm),)
ROCM_PATH ?= /usr ROCM_PATH ?= /usr
AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@ -815,11 +773,7 @@ ifdef GGML_HIPBLAS
AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
endif endif
GGML_CUDA_DMMV_X ?= 32 MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
GGML_CUDA_MMV_Y ?= 1
GGML_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
ifdef GGML_HIP_UMA ifdef GGML_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA MK_CPPFLAGS += -DGGML_HIP_UMA
@ -832,13 +786,6 @@ endif # GGML_HIP_UMA
HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS)) HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
ifdef GGML_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # GGML_CUDA_FORCE_DMMV
ifdef GGML_CUDA_FORCE_MMQ ifdef GGML_CUDA_FORCE_MMQ
HIPFLAGS += -DGGML_CUDA_FORCE_MMQ HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
@ -852,12 +799,12 @@ ifdef GGML_CUDA_NO_PEER_COPY
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # GGML_CUDA_NO_PEER_COPY endif # GGML_CUDA_NO_PEER_COPY
OBJ_GGML += ggml/src/ggml-cuda.o OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
OBJ_GGML += $(OBJ_CUDA_TMPL) OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
ggml/src/ggml-cuda.o: \ ggml/src/ggml-cuda/ggml-cuda.o: \
ggml/src/ggml-cuda.cu \ ggml/src/ggml-cuda/ggml-cuda.cu \
ggml/include/ggml-cuda.h \ ggml/include/ggml-cuda.h \
ggml/include/ggml.h \ ggml/include/ggml.h \
ggml/include/ggml-backend.h \ ggml/include/ggml-backend.h \
@ -872,72 +819,172 @@ ggml/src/ggml-cuda/%.o: \
ggml/src/ggml-common.h \ ggml/src/ggml-common.h \
ggml/src/ggml-cuda/common.cuh ggml/src/ggml-cuda/common.cuh
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # GGML_HIPBLAS endif # GGML_HIP
ifdef GGML_MUSA
ifeq ($(wildcard /opt/musa),)
MUSA_PATH ?= /usr/local/musa
else
MUSA_PATH ?= /opt/musa
endif
MUSA_ARCHITECTURES ?= 21;22
MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
MK_LDFLAGS += -lmusa -lmusart -lmublas
ifndef GGML_NO_OPENMP
# For Ubuntu Focal
MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
MK_LDFLAGS += -L/usr/lib/llvm-10/lib
# For Ubuntu Jammy
MK_CPPFLAGS += -I/usr/lib/llvm-14/lib/clang/14.0.0/include
MK_LDFLAGS += -L/usr/lib/llvm-14/lib
endif # GGML_NO_OPENMP
CC := $(MUSA_PATH)/bin/clang
CXX := $(MUSA_PATH)/bin/clang++
MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
MUSAFLAGS = -x musa -mtgpu
MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))
ifdef GGML_CUDA_FORCE_MMQ
MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
endif # GGML_CUDA_FORCE_MMQ
ifdef GGML_CUDA_FORCE_CUBLAS
MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
endif # GGML_CUDA_FORCE_CUBLAS
ifdef GGML_CUDA_F16
MUSAFLAGS += -DGGML_CUDA_F16
endif # GGML_CUDA_F16
ifdef GGML_CUDA_DMMV_F16
MUSAFLAGS += -DGGML_CUDA_F16
endif # GGML_CUDA_DMMV_F16
ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
else
MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
ifdef GGML_CUDA_NO_PEER_COPY
MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # GGML_CUDA_NO_PEER_COPY
ifdef GGML_CUDA_FA_ALL_QUANTS
MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
endif # GGML_CUDA_FA_ALL_QUANTS
OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
ggml/src/ggml-cuda/ggml-cuda.o: \
ggml/src/ggml-cuda/ggml-cuda.cu \
ggml/include/ggml-cuda.h \
ggml/include/ggml.h \
ggml/include/ggml-backend.h \
ggml/src/ggml-backend-impl.h \
ggml/src/ggml-common.h \
$(wildcard ggml/src/ggml-cuda/*.cuh)
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
ggml/src/ggml-cuda/%.o: \
ggml/src/ggml-cuda/%.cu \
ggml/include/ggml.h \
ggml/src/ggml-common.h \
ggml/src/ggml-cuda/common.cuh
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
endif # GGML_MUSA
ifdef GGML_METAL ifdef GGML_METAL
MK_CPPFLAGS += -DGGML_USE_METAL MK_CPPFLAGS += -DGGML_USE_METAL
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
OBJ_GGML += ggml/src/ggml-metal.o OBJ_GGML_EXT += ggml/src/ggml-metal/ggml-metal.o
ifdef GGML_METAL_USE_BF16
MK_CPPFLAGS += -DGGML_METAL_USE_BF16
endif # GGML_METAL_USE_BF16
ifdef GGML_METAL_NDEBUG ifdef GGML_METAL_NDEBUG
MK_CPPFLAGS += -DGGML_METAL_NDEBUG MK_CPPFLAGS += -DGGML_METAL_NDEBUG
endif endif
ifdef GGML_METAL_EMBED_LIBRARY ifdef GGML_METAL_EMBED_LIBRARY
MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
OBJ_GGML += ggml/src/ggml-metal-embed.o OBJ_GGML_EXT += ggml/src/ggml-metal-embed.o
endif endif
endif # GGML_METAL endif # GGML_METAL
ifdef GGML_METAL ifdef GGML_METAL
ggml/src/ggml-metal.o: \ ggml/src/ggml-metal/ggml-metal.o: \
ggml/src/ggml-metal.m \ ggml/src/ggml-metal/ggml-metal.m \
ggml/src/ggml-metal/ggml-metal-impl.h \
ggml/include/ggml-metal.h \ ggml/include/ggml-metal.h \
ggml/include/ggml.h ggml/include/ggml.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
ifdef GGML_METAL_EMBED_LIBRARY ifdef GGML_METAL_EMBED_LIBRARY
ggml/src/ggml-metal-embed.o: \ ggml/src/ggml-metal-embed.o: \
ggml/src/ggml-metal.metal \ ggml/src/ggml-metal/ggml-metal.metal \
ggml/src/ggml-metal/ggml-metal-impl.h \
ggml/src/ggml-common.h ggml/src/ggml-common.h
@echo "Embedding Metal library" @echo "Embedding Metal library"
@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal @sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
$(eval TEMP_ASSEMBLY=$(shell mktemp -d)) $(eval TEMP_ASSEMBLY=$(shell mktemp -d))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s @echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@ $(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s @rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
@rmdir ${TEMP_ASSEMBLY} @rmdir ${TEMP_ASSEMBLY}
endif endif
endif # GGML_METAL endif # GGML_METAL
OBJ_GGML += \ DIR_GGML = ggml
ggml/src/ggml.o \ DIR_LLAMA = src
ggml/src/ggml-cpu.o \ DIR_COMMON = common
ggml/src/ggml-alloc.o \
ggml/src/ggml-backend.o \ OBJ_GGML = \
ggml/src/ggml-quants.o \ $(DIR_GGML)/src/ggml.o \
ggml/src/ggml-aarch64.o $(DIR_GGML)/src/ggml-alloc.o \
$(DIR_GGML)/src/ggml-backend.o \
$(DIR_GGML)/src/ggml-backend-reg.o \
$(DIR_GGML)/src/ggml-opt.o \
$(DIR_GGML)/src/ggml-quants.o \
$(DIR_GGML)/src/ggml-threading.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
$(OBJ_GGML_EXT)
OBJ_LLAMA = \ OBJ_LLAMA = \
src/llama.o \ $(DIR_LLAMA)/llama.o \
src/llama-vocab.o \ $(DIR_LLAMA)/llama-vocab.o \
src/llama-grammar.o \ $(DIR_LLAMA)/llama-grammar.o \
src/llama-sampling.o \ $(DIR_LLAMA)/llama-sampling.o \
src/unicode.o \ $(DIR_LLAMA)/unicode.o \
src/unicode-data.o $(DIR_LLAMA)/unicode-data.o
OBJ_COMMON = \ OBJ_COMMON = \
common/common.o \ $(DIR_COMMON)/common.o \
common/arg.o \ $(DIR_COMMON)/arg.o \
common/log.o \ $(DIR_COMMON)/log.o \
common/console.o \ $(DIR_COMMON)/console.o \
common/ngram-cache.o \ $(DIR_COMMON)/ngram-cache.o \
common/sampling.o \ $(DIR_COMMON)/sampling.o \
common/build-info.o \ $(DIR_COMMON)/speculative.o \
common/json-schema-to-grammar.o $(DIR_COMMON)/build-info.o \
$(DIR_COMMON)/json-schema-to-grammar.o
OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
@ -993,7 +1040,6 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1))
ifdef GGML_CUDA ifdef GGML_CUDA
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
ifndef GGML_MUSA
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
ifndef CUDA_DOCKER_ARCH ifndef CUDA_DOCKER_ARCH
@ -1003,7 +1049,6 @@ endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
endif # GGML_MUSA
endif # GGML_CUDA endif # GGML_CUDA
$(info ) $(info )
@ -1040,224 +1085,78 @@ endif
# Build libraries # Build libraries
# #
# ggml # Libraries
LIB_GGML = libggml.so
LIB_GGML_S = libggml.a
ggml/src/ggml.o: \ LIB_LLAMA = libllama.so
ggml/src/ggml.c \ LIB_LLAMA_S = libllama.a
ggml/include/ggml.h
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-cpu.o: \ LIB_COMMON = libcommon.so
ggml/src/ggml-cpu.c \ LIB_COMMON_S = libcommon.a
ggml/include/ggml.h \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-alloc.o: \ # Targets
ggml/src/ggml-alloc.c \ BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S)
ggml/include/ggml.h \
ggml/include/ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-backend.o: \ # Dependency files
ggml/src/ggml-backend.cpp \ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
ggml/src/ggml-backend-impl.h \
ggml/include/ggml.h \
ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml/src/ggml-quants.o: \ # Default target
ggml/src/ggml-quants.c \ all: $(BUILD_TARGETS)
ggml/include/ggml.h \
ggml/src/ggml-quants.h \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-aarch64.o: \ # force c++ build for source file that have same name as c file
ggml/src/ggml-aarch64.c \ # Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
ggml/include/ggml.h \ $(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
ggml/src/ggml-aarch64.h \ $(CXX) $(CXXFLAGS) -MMD -c $< -o $@
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-blas.o: \ # Rules for building object files
ggml/src/ggml-blas.cpp \ $(DIR_GGML)/%.o: $(DIR_GGML)/%.c
ggml/include/ggml-blas.h $(CC) $(CFLAGS) -MMD -c $< -o $@
$(CXX) $(CXXFLAGS) -c $< -o $@
ifndef GGML_NO_LLAMAFILE $(DIR_GGML)/%.o: $(DIR_GGML)/%.cpp
ggml/src/llamafile/sgemm.o: \ $(CXX) $(CXXFLAGS) -MMD -c $< -o $@
ggml/src/llamafile/sgemm.cpp \
ggml/src/llamafile/sgemm.h \
ggml/include/ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_NO_LLAMAFILE
ifndef GGML_NO_AMX $(DIR_LLAMA)/%.o: $(DIR_LLAMA)/%.cpp
ggml/src/ggml-amx.o: \ $(CXX) $(CXXFLAGS) -MMD -c $< -o $@
ggml/src/ggml-amx.cpp \
ggml/include/ggml-amx.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml/src/ggml-amx/mmq.o: \ $(DIR_COMMON)/%.o: $(DIR_COMMON)/%.cpp
ggml/src/ggml-amx/mmq.cpp \ $(CXX) $(CXXFLAGS) -MMD -c $< -o $@
ggml/src/ggml-amx/mmq.h \
ggml/include/ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif
ifdef GGML_RPC # Rules for building libraries
ggml/src/ggml-rpc.o: \ $(LIB_GGML): $(OBJ_GGML)
ggml/src/ggml-rpc.cpp \
ggml/include/ggml-rpc.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_RPC
$(LIB_GGML): \
$(OBJ_GGML)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
$(LIB_GGML_S): \ $(LIB_GGML_S): $(OBJ_GGML)
$(OBJ_GGML)
ar rcs $(LIB_GGML_S) $^ ar rcs $(LIB_GGML_S) $^
# llama $(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML)
src/unicode.o: \
src/unicode.cpp \
src/unicode.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/unicode-data.o: \
src/unicode-data.cpp \
src/unicode-data.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama.o: \
src/llama.cpp \
src/llama-impl.h \
src/llama-vocab.h \
src/llama-grammar.h \
src/llama-sampling.h \
src/unicode.h \
include/llama.h \
ggml/include/ggml-cuda.h \
ggml/include/ggml-metal.h \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-vocab.o: \
src/llama-vocab.cpp \
src/llama-vocab.h \
src/llama-impl.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-grammar.o: \
src/llama-grammar.cpp \
src/llama-grammar.h \
src/llama-impl.h \
src/llama-vocab.h \
src/llama-sampling.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-sampling.o: \
src/llama-sampling.cpp \
src/llama-sampling.h \
src/llama-impl.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
$(LIB_LLAMA): \
$(OBJ_LLAMA) \
$(LIB_GGML)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
$(LIB_LLAMA_S): \ $(LIB_LLAMA_S): $(OBJ_LLAMA)
$(OBJ_LLAMA)
ar rcs $(LIB_LLAMA_S) $^ ar rcs $(LIB_LLAMA_S) $^
# common $(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML)
common/common.o: \
common/common.cpp \
common/common.h \
common/console.h \
common/sampling.h \
common/json.hpp \
common/json-schema-to-grammar.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/arg.o: \
common/arg.cpp \
common/arg.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/log.o: \
common/log.cpp \
common/log.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/sampling.o: \
common/sampling.cpp \
common/sampling.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/console.o: \
common/console.cpp \
common/console.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/json-schema-to-grammar.o: \
common/json-schema-to-grammar.cpp \
common/json-schema-to-grammar.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/ngram-cache.o: \
common/ngram-cache.cpp \
common/ngram-cache.h
$(CXX) $(CXXFLAGS) -c $< -o $@
$(LIB_COMMON): \
$(OBJ_COMMON) \
$(LIB_LLAMA) \
$(LIB_GGML)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
$(LIB_COMMON_S): \ $(LIB_COMMON_S): $(OBJ_COMMON)
$(OBJ_COMMON)
ar rcs $(LIB_COMMON_S) $^ ar rcs $(LIB_COMMON_S) $^
clean: # Include dependency files
rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS) -include $(DEP_FILES)
rm -rvf src/*.o
rm -rvf tests/*.o # Clean generated server assets
rm -rvf examples/*.o clean-server-assets:
rm -rvf common/*.o find examples/server -type f -name "*.js.hpp" -delete
rm -rvf *.a find examples/server -type f -name "*.mjs.hpp" -delete
rm -rvf *.dll find examples/server -type f -name "*.css.hpp" -delete
rm -rvf *.so find examples/server -type f -name "*.html.hpp" -delete
rm -rvf *.dot
rm -rvf ggml/*.a # Clean rule
rm -rvf ggml/*.dll clean: clean-server-assets
rm -rvf ggml/*.so rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf ggml/src/*.o rm -rvf *.a *.dll *.so *.dot
rm -rvf ggml/src/llamafile/*.o find ggml src common tests examples pocs -type f -name "*.o" -delete
rm -rvf common/build-info.cpp find ggml src common tests examples pocs -type f -name "*.d" -delete
rm -vrf ggml/src/ggml-metal-embed.metal
rm -vrf ggml/src/ggml-cuda/*.o
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
rm -vrf ggml/src/ggml-amx/*.o
rm -rvf $(BUILD_TARGETS)
rm -rvf $(TEST_TARGETS)
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
rm -rvf $(LEGACY_TARGETS_CLEAN)
find examples pocs -type f -name "*.o" -delete
# #
# Examples # Examples
@ -1283,6 +1182,11 @@ llama-infill: examples/infill/infill.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-run: examples/run/run.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-simple: examples/simple/simple.cpp \ llama-simple: examples/simple/simple.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@ -1456,20 +1360,14 @@ llama-server: \
examples/server/utils.hpp \ examples/server/utils.hpp \
examples/server/httplib.h \ examples/server/httplib.h \
examples/server/index.html.hpp \ examples/server/index.html.hpp \
examples/server/completion.js.hpp \
examples/server/loading.html.hpp \ examples/server/loading.html.hpp \
examples/server/deps_daisyui.min.css.hpp \
examples/server/deps_markdown-it.js.hpp \
examples/server/deps_tailwindcss.js.hpp \
examples/server/deps_vue.esm-browser.js.hpp \
common/json.hpp \ common/json.hpp \
common/stb_image.h \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`: # Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
examples/server/%.hpp: examples/server/public/% Makefile examples/server/%.hpp: examples/server/public/% FORCE Makefile
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \ @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
echo "unsigned char $${NAME}[] = {" && \ echo "unsigned char $${NAME}[] = {" && \
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \ cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
@ -1507,6 +1405,14 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift swift: examples/batched.swift
(cd examples/batched.swift; make build) (cd examples/batched.swift; make build)
@ -1563,11 +1469,6 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-grad0: tests/test-grad0.cpp \
$(OBJ_GGML)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-opt: tests/test-opt.cpp \ tests/test-opt: tests/test-opt.cpp \
$(OBJ_GGML) $(OBJ_GGML)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@ -1649,7 +1550,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed. # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
# #
# Mark legacy binary targets as .PHONY so that they are always checked. # Mark legacy binary targets as .PHONY so that they are always checked.
.PHONY: main quantize perplexity embedding server .PHONY: FORCE main quantize perplexity embedding server
# Define the object file target # Define the object file target
examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp

View file

@ -2,49 +2,6 @@
import PackageDescription import PackageDescription
var sources = [
"src/llama.cpp",
"src/llama-vocab.cpp",
"src/llama-grammar.cpp",
"src/llama-sampling.cpp",
"src/unicode.cpp",
"src/unicode-data.cpp",
"ggml/src/ggml.c",
"ggml/src/ggml-cpu.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.cpp",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-aarch64.c",
]
var resources: [Resource] = []
var linkerSettings: [LinkerSetting] = []
var cSettings: [CSetting] = [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.unsafeFlags(["-fno-objc-arc"]),
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
]
#if canImport(Darwin)
sources.append("ggml/src/ggml-metal.m")
resources.append(.process("ggml/src/ggml-metal.metal"))
linkerSettings.append(.linkedFramework("Accelerate"))
cSettings.append(
contentsOf: [
.define("GGML_USE_ACCELERATE"),
.define("GGML_USE_METAL")
]
)
#endif
#if os(Linux)
cSettings.append(.define("_GNU_SOURCE"))
#endif
let package = Package( let package = Package(
name: "llama", name: "llama",
platforms: [ platforms: [
@ -57,24 +14,6 @@ let package = Package(
.library(name: "llama", targets: ["llama"]), .library(name: "llama", targets: ["llama"]),
], ],
targets: [ targets: [
.target( .systemLibrary(name: "llama", pkgConfig: "llama"),
name: "llama", ]
path: ".",
exclude: [
"cmake",
"examples",
"scripts",
"models",
"tests",
"CMakeLists.txt",
"Makefile"
],
sources: sources,
resources: resources,
publicHeadersPath: "spm-headers",
cSettings: cSettings,
linkerSettings: linkerSettings
)
],
cxxLanguageStandard: .cxx11
) )

626
README.md
View file

@ -4,7 +4,6 @@
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@ -26,7 +25,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Description ## Description
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
variety of hardware - locally and in the cloud. range of hardware - locally and in the cloud.
- Plain C/C++ implementation without any dependencies - Plain C/C++ implementation without any dependencies
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
@ -36,14 +35,17 @@ variety of hardware - locally and in the cloud.
- Vulkan and SYCL backend support - Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
improved significantly thanks to many contributions. It is the main playground for developing new features for the
[ggml](https://github.com/ggerganov/ggml) library.
**Supported models:** <details>
<summary>Models</summary>
Typically finetunes of the base models below are supported as well. Typically finetunes of the base models below are supported as well.
Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md)
#### Text-only
- [X] LLaMA 🦙 - [X] LLaMA 🦙
- [x] LLaMA 2 🦙🦙 - [x] LLaMA 2 🦙🦙
- [x] LLaMA 3 🦙🦙🦙 - [x] LLaMA 3 🦙🦙🦙
@ -67,6 +69,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557) - [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi) - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
- [x] [PhiMoE](https://github.com/ggerganov/llama.cpp/pull/11003)
- [x] [GPT-2](https://huggingface.co/gpt2) - [x] [GPT-2](https://huggingface.co/gpt2)
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118) - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
- [x] [InternLM2](https://huggingface.co/models?search=internlm2) - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
@ -79,6 +82,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo) - [x] [OLMo](https://allenai.org/olmo)
- [x] [OLMo 2](https://allenai.org/olmo)
- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924) - [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330) - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia) - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
@ -95,10 +99,10 @@ Typically finetunes of the base models below are supported as well.
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) #### Multimodal
**Multimodal models:**
- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
- [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava) - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
@ -109,8 +113,12 @@ Typically finetunes of the base models below are supported as well.
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM) - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny) - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
**Bindings:** </details>
<details>
<summary>Bindings</summary>
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
@ -131,321 +139,339 @@ Typically finetunes of the base models below are supported as well.
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
**UI:** </details>
Unless otherwise noted these projects are open-source with permissive licensing: <details>
<summary>UIs</summary>
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [nat/openplayground](https://github.com/nat/openplayground)
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
- [ramalama](https://github.com/containers/ramalama) (MIT)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
- [ollama/ollama](https://github.com/ollama/ollama)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [RAGNA Desktop](https://ragna.app/) (proprietary)
- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
- [Msty](https://msty.app) (proprietary)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [MindMac](https://mindmac.app) (proprietary)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
**Tools:** - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
- [LARS](https://github.com/abgulati/LARS) (AGPL)
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [MindMac](https://mindmac.app) (proprietary)
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0)
- [nat/openplayground](https://github.com/nat/openplayground) (MIT)
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT)
- [ollama/ollama](https://github.com/ollama/ollama) (MIT)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [ramalama](https://github.com/containers/ramalama) (MIT)
- [semperai/amica](https://github.com/semperai/amica) (MIT)
- [withcatai/catai](https://github.com/withcatai/catai) (MIT)
</details>
<details>
<summary>Tools</summary>
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML - [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
- [akx/ollama-dl](https://github.com/akx/ollama-dl) download models from the Ollama library to be used directly with llama.cpp - [akx/ollama-dl](https://github.com/akx/ollama-dl) download models from the Ollama library to be used directly with llama.cpp
- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example) - [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
**Infrastructure:** </details>
<details>
<summary>Infrastructure</summary>
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
</details>
<details>
<summary>Games</summary>
**Games:**
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
## Demo
<details>
<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
```
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm
I UNAME_M: arm64
I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
I LDFLAGS: -framework Accelerate
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
make: Nothing to be done for `default'.
main: build = 1041 (cf658ad)
main: seed = 1692823051
llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
llama_model_loader: - type f32: 81 tensors
llama_model_loader: - type q4_0: 281 tensors
llama_model_loader: - type q6_K: 1 tensors
llm_load_print_meta: format = GGUF V1 (latest)
llm_load_print_meta: arch = llama
llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_ctx = 512
llm_load_print_meta: n_embd = 5120
llm_load_print_meta: n_head = 40
llm_load_print_meta: n_head_kv = 40
llm_load_print_meta: n_layer = 40
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: f_norm_eps = 1.0e-05
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
llm_load_print_meta: n_ff = 13824
llm_load_print_meta: freq_base = 10000.0
llm_load_print_meta: freq_scale = 1
llm_load_print_meta: model type = 13B
llm_load_print_meta: model ftype = mostly Q4_0
llm_load_print_meta: model size = 13.02 B
llm_load_print_meta: general.name = LLaMA v2
llm_load_print_meta: BOS token = 1 '<s>'
llm_load_print_meta: EOS token = 2 '</s>'
llm_load_print_meta: UNK token = 0 '<unk>'
llm_load_print_meta: LF token = 13 '<0x0A>'
llm_load_tensors: ggml ctx size = 0.11 MB
llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
...................................................................................................
llama_new_context_with_model: kv self size = 400.00 MB
llama_new_context_with_model: compute buffer total size = 75.41 MB
system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
Building a website can be done in 10 simple steps:
Step 1: Find the right website platform.
Step 2: Choose your domain name and hosting plan.
Step 3: Design your website layout.
Step 4: Write your website content and add images.
Step 5: Install security features to protect your site from hackers or spammers
Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
Step 7: Test it again with people who are not related to you personally friends or family members will work just fine!
Step 8: Start marketing and promoting the website via social media channels or paid ads
Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
How does a Website Work?
A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit whether its an image or text file (like PDFs). In order for someone elses browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
How to
llama_print_timings: load time = 576.45 ms
llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
llama_print_timings: total time = 25431.49 ms
```
</details> </details>
<details>
<summary>Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook</summary>
And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4
</details>
## Usage
Here are the end-to-end binary build and model conversion steps for most supported models.
### Basic usage
Firstly, you need to get the binary. There are different methods that you can follow:
- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
You can run a basic completion using this command:
```bash
llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
# Output:
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
```
See [this page](./examples/main/README.md) for a full list of parameters.
### Conversation mode
If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
```bash
llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
# Output:
# > hi, who are you?
# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
#
# > what is 1+1?
# Easy peasy! The answer to 1+1 is... 2!
```
By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
```bash
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
```
You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
```bash
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
```
### Web server
[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
Example usage:
```bash
./llama-server -m your_model.gguf --port 8080
# Basic web UI can be accessed via browser: http://localhost:8080
# Chat completion endpoint: http://localhost:8080/v1/chat/completions
```
### Interactive mode
> [!NOTE]
> If you prefer basic usage, please consider using conversation mode instead of interactive mode
In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
Here is an example of a few-shot interaction, invoked with the command
```bash
# default arguments using a 7B model
./examples/chat.sh
# advanced chat with a 13B model
./examples/chat-13B.sh
# custom arguments using a 13B model
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
### Persistent Interaction
The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
```bash
# Start a new chat
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
# Resume that chat
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
# Start a different chat with the same prompt/model
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
# Different prompt cache for different prompt/model
PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
```
### Constrained output with grammars
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
```bash
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
```
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
## Build
Please refer to [Build llama.cpp locally](./docs/build.md)
## Supported backends ## Supported backends
| Backend | Target devices | | Backend | Target devices |
| --- | --- | | --- | --- |
| [Metal](./docs/build.md#metal-build) | Apple Silicon | | [Metal](docs/build.md#metal-build) | Apple Silicon |
| [BLAS](./docs/build.md#blas-build) | All | | [BLAS](docs/build.md#blas-build) | All |
| [BLIS](./docs/backend/BLIS.md) | All | | [BLIS](docs/backend/BLIS.md) | All |
| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU | | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU | | [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
| [CUDA](./docs/build.md#cuda) | Nvidia GPU | | [CUDA](docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](./docs/build.md#hipblas) | AMD GPU | | [HIP](docs/build.md#hip) | AMD GPU |
| [Vulkan](./docs/build.md#vulkan) | GPU | | [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](./docs/build.md#cann) | Ascend NPU | | [CANN](docs/build.md#cann) | Ascend NPU |
## Tools ## Building the project
### Prepare and Quantize The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
> [!NOTE] - Clone this repository and build locally, see [how to build](docs/build.md)
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours. - On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
- Use a Docker image, see [documentation for Docker](docs/docker.md)
- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases)
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. ## Obtaining and quantizing models
Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives. The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
To learn more about quantizing model, [read this documentation](./examples/quantize/README.md) - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
### Perplexity (measuring model quality) You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf <user>/<model>[:quant]`
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better). After downloading a model, use the CLI tools to run it locally - see below.
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123)
- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268)
- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669)
To learn more about model quantization, [read this documentation](examples/quantize/README.md)
## [`llama-cli`](examples/main)
#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
- <details open>
<summary>Run in conversation mode</summary>
Models with a built-in chat template will automatically activate conversation mode. If this doesn't occur, you can manually enable it by adding `-cnv` and specifying a suitable chat template with `--chat-template NAME`
```bash
llama-cli -m model.gguf
# > hi, who are you?
# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
#
# > what is 1+1?
# Easy peasy! The answer to 1+1 is... 2!
```
</details>
- <details>
<summary>Run in conversation mode with custom chat template</summary>
```bash
# use the "chatml" template (use -h to see the list of supported templates)
llama-cli -m model.gguf -cnv --chat-template chatml
# use a custom template
llama-cli -m model.gguf -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
```
</details>
- <details>
<summary>Run simple text completion</summary>
To disable conversation mode explicitly, use `-no-cnv`
```bash
llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
```
</details>
- <details>
<summary>Constrain the output with a custom grammar</summary>
```bash
llama-cli -m model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
# {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"}
```
The [grammars/](grammars/) folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](grammars/README.md).
For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/
</details>
## [`llama-server`](examples/server)
#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.
- <details open>
<summary>Start a local HTTP server with default configuration on port 8080</summary>
```bash
llama-server -m model.gguf --port 8080
# Basic web UI can be accessed via browser: http://localhost:8080
# Chat completion endpoint: http://localhost:8080/v1/chat/completions
```
</details>
- <details>
<summary>Support multiple-users and parallel decoding</summary>
```bash
# up to 4 concurrent requests, each with 4096 max context
llama-server -m model.gguf -c 16384 -np 4
```
</details>
- <details>
<summary>Enable speculative decoding</summary>
```bash
# the draft.gguf model should be a small variant of the target model.gguf
llama-server -m model.gguf -md draft.gguf
```
</details>
- <details>
<summary>Serve an embedding model</summary>
```bash
# use the /embedding endpoint
llama-server -m model.gguf --embedding --pooling cls -ub 8192
```
</details>
- <details>
<summary>Serve a reranking model</summary>
```bash
# use the /reranking endpoint
llama-server -m model.gguf --reranking
```
</details>
- <details>
<summary>Constrain all outputs with a grammar</summary>
```bash
# custom grammar
llama-server -m model.gguf --grammar-file grammar.gbnf
# JSON
llama-server -m model.gguf --grammar-file grammars/json.gbnf
```
</details>
## [`llama-perplexity`](examples/perplexity)
#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
- <details open>
<summary>Measure the perplexity over a text file</summary>
```bash
llama-perplexity -m model.gguf -f file.txt
# [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ...
# Final estimate: PPL = 5.4007 +/- 0.67339
```
</details>
- <details>
<summary>Measure KL divergence</summary>
```bash
# TODO
```
</details>
[^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
## [`llama-bench`](examples/llama-bench)
#### Benchmark the performance of the inference for various parameters.
- <details open>
<summary>Run default benchmark</summary>
```bash
llama-bench -m model.gguf
# Output:
# | model | size | params | backend | threads | test | t/s |
# | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: |
# | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | pp512 | 5765.41 ± 20.55 |
# | qwen2 1.5B Q4_0 | 885.97 MiB | 1.54 B | Metal,BLAS | 16 | tg128 | 197.71 ± 0.81 |
#
# build: 3e0ba0e60 (4229)
```
</details>
## [`llama-run`](examples/run)
#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
- <details>
<summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
```bash
llama-run granite-code
```
</details>
[^3]: [RamaLama](https://github.com/containers/ramalama)
## [`llama-simple`](examples/simple)
#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
- <details>
<summary>Basic text completion</summary>
```bash
llama-simple -m model.gguf
# Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called "The Art of
```
</details>
To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
## Contributing ## Contributing
@ -458,22 +484,21 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532) - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
## Other documentations ## Other documentation
- [main (cli)](./examples/main/README.md) - [main (cli)](examples/main/README.md)
- [server](./examples/server/README.md) - [server](examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md) - [GBNF grammars](grammars/README.md)
- [GBNF grammars](./grammars/README.md)
**Development documentations** #### Development documentation
- [How to build](./docs/build.md) - [How to build](docs/build.md)
- [Running on Docker](./docs/docker.md) - [Running on Docker](docs/docker.md)
- [Build on Android](./docs/android.md) - [Build on Android](docs/android.md)
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md) - [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
**Seminal papers and background on the models** #### Seminal papers and background on the models
If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
- LLaMA: - LLaMA:
@ -484,3 +509,6 @@ If your issue is with model generation quality, then please at least scan the fo
- GPT-3.5 / InstructGPT / ChatGPT: - GPT-3.5 / InstructGPT / ChatGPT:
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following) - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
#### References

4
Sources/llama/llama.h Normal file
View file

@ -0,0 +1,4 @@
#pragma once
#include <llama.h>

View file

@ -0,0 +1,5 @@
module llama [system] {
header "llama.h"
link "llama"
export *
}

View file

@ -39,7 +39,7 @@ SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON" CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON" CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
fi fi
if [ ! -z ${GG_BUILD_CUDA} ]; then if [ ! -z ${GG_BUILD_CUDA} ]; then
@ -326,17 +326,17 @@ function gg_run_open_llama_7b_v2 {
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@ -460,17 +460,17 @@ function gg_run_pythia_1_4b {
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@ -591,17 +591,17 @@ function gg_run_pythia_2_8b {
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@ -815,7 +815,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
ln -sfn ${mnt_models} ${SRC}/models-mnt ln -sfn ${mnt_models} ${SRC}/models-mnt
# Create a fresh python3 venv and enter it # Create a fresh python3 venv and enter it
python3 -m venv "$MNT/venv" if ! python3 -m venv "$MNT/venv"; then
echo "Error: Failed to create Python virtual environment at $MNT/venv."
exit 1
fi
source "$MNT/venv/bin/activate" source "$MNT/venv/bin/activate"
pip install -r ${SRC}/requirements.txt --disable-pip-version-check pip install -r ${SRC}/requirements.txt --disable-pip-version-check

33
cmake/common.cmake Normal file
View file

@ -0,0 +1,33 @@
function(llama_add_compile_flags)
if (LLAMA_FATAL_WARNINGS)
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
list(APPEND C_FLAGS -Werror)
list(APPEND CXX_FLAGS -Werror)
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
add_compile_options(/WX)
endif()
endif()
if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-Werror=implicit-int -Werror=implicit-function-declaration)
list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
list(APPEND C_FLAGS ${WARNING_FLAGS})
list(APPEND CXX_FLAGS ${WARNING_FLAGS})
ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
else()
# todo : msvc
set(C_FLAGS "" PARENT_SCOPE)
set(CXX_FLAGS "" PARENT_SCOPE)
endif()
endif()
endfunction()

View file

@ -3,18 +3,60 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(GGML_BLAS @GGML_BLAS@) set(GGML_STATIC @GGML_STATIC@)
set(GGML_CUDA @GGML_CUDA@) set(GGML_NATIVE @GGML_NATIVE@)
set(GGML_METAL @GGML_METAL@) set(GGML_LTO @GGML_LTO@)
set(GGML_HIPBLAS @GGML_HIPBLAS@) set(GGML_CCACHE @GGML_CCACHE@)
set(GGML_AVX @GGML_AVX@)
set(GGML_AVX2 @GGML_AVX2@)
set(GGML_AVX512 @GGML_AVX512@)
set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@)
set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@)
set(GGML_AVX512_BF16 @GGML_AVX512_BF16@)
set(GGML_AMX_TILE @GGML_AMX_TILE@)
set(GGML_AMX_INT8 @GGML_AMX_INT8@)
set(GGML_AMX_BF16 @GGML_AMX_BF16@)
set(GGML_FMA @GGML_FMA@)
set(GGML_LASX @GGML_LASX@)
set(GGML_LSX @GGML_LSX@)
set(GGML_RVV @GGML_RVV@)
set(GGML_SVE @GGML_SVE@)
set(GGML_ACCELERATE @GGML_ACCELERATE@) set(GGML_ACCELERATE @GGML_ACCELERATE@)
set(GGML_VULKAN @GGML_VULKAN@) set(GGML_OPENMP @GGML_OPENMP@)
set(GGML_CPU_HBM @GGML_CPU_HBM@)
set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@)
set(GGML_CUDA_FORCE_MMQ @GGML_CUDA_FORCE_MMQ@)
set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@)
set(GGML_CUDA_F16 @GGML_CUDA_F16@)
set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@)
set(GGML_CUDA_NO_PEER_COPY @GGML_CUDA_NO_PEER_COPY@)
set(GGML_CUDA_NO_VMM @GGML_CUDA_NO_VMM@)
set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@)
set(GGML_CUDA_GRAPHS @GGML_CUDA_GRAPHS@)
set(GGML_HIP_UMA @GGML_HIP_UMA@)
set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@) set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@) set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@) set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@) set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@)
set(GGML_SYCL @GGML_SYCL@) set(GGML_VULKAN_PERF @GGML_VULKAN_PERF@)
set(GGML_OPENMP @GGML_OPENMP@) set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@)
set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@)
set(GGML_METAL_NDEBUG @GGML_METAL_NDEBUG@)
set(GGML_METAL_SHADER_DEBUG @GGML_METAL_SHADER_DEBUG@)
set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@)
set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@)
set(GGML_METAL_STD @GGML_METAL_STD@)
set(GGML_SYCL_F16 @GGML_SYCL_F16@)
set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@)
set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@)
@PACKAGE_INIT@ @PACKAGE_INIT@
@ -22,65 +64,111 @@ set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
# Ensure transient dependencies satisfied
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
if (APPLE AND GGML_ACCELERATE) set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) set(_llama_link_deps "")
set(_llama_link_opts "")
foreach(_ggml_lib ggml ggml-base)
string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
find_library(${_ggml_lib_var} ${_ggml_lib}
REQUIRED
HINTS ${LLAMA_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH
)
list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
message(STATUS "Found ${${_ggml_lib_var}}")
endforeach()
foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan)
string(TOUPPER "GGML_${backend}" backend_id)
set(_ggml_lib "ggml-${backend}")
string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
find_library(${_ggml_lib_var} ${_ggml_lib}
HINTS ${LLAMA_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH
)
if(${_ggml_lib_var})
list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
set(${backend_id} ON)
message(STATUS "Found backend ${${_ggml_lib_var}}")
else()
set(${backend_id} OFF)
endif()
endforeach()
if (NOT LLAMA_SHARED_LIB)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK})
endif()
if (GGML_OPENMP)
find_package(OpenMP REQUIRED)
list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
endif()
if (GGML_CPU_HBM)
find_library(memkind memkind REQUIRED)
list(APPEND _llama_link_deps memkind)
endif()
if (GGML_BLAS)
find_package(BLAS REQUIRED)
list(APPEND _llama_link_deps ${BLAS_LIBRARIES})
list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS})
endif()
if (GGML_CUDA)
find_package(CUDAToolkit REQUIRED)
endif()
if (GGML_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
endif()
if (GGML_VULKAN)
find_package(Vulkan REQUIRED)
list(APPEND _llama_link_deps Vulkan::Vulkan)
endif()
if (GGML_HIP)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas)
endif()
if (GGML_SYCL)
find_package(DNNL)
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
list(APPEND _llama_link_deps DNNL::dnnl)
endif()
if (WIN32)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
endif()
endif()
endif() endif()
if (GGML_BLAS)
find_package(BLAS REQUIRED)
endif()
if (GGML_CUDA)
find_package(CUDAToolkit REQUIRED)
endif()
if (GGML_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()
if (GGML_VULKAN)
find_package(Vulkan REQUIRED)
endif()
if (GGML_HIPBLAS)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
endif()
if (GGML_SYCL)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
endif()
if (GGML_OPENMP)
find_package(OpenMP REQUIRED)
endif()
find_library(ggml_LIBRARY ggml
REQUIRED
HINTS ${LLAMA_LIB_DIR})
find_library(llama_LIBRARY llama find_library(llama_LIBRARY llama
REQUIRED REQUIRED
HINTS ${LLAMA_LIB_DIR}) HINTS ${LLAMA_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH
set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@") )
set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
add_library(llama UNKNOWN IMPORTED) add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama set_target_properties(llama
PROPERTIES PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}" INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
INTERFACE_LINK_OPTIONS "${_llama_link_opts}"
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}" INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}" IMPORTED_LOCATION "${llama_LIBRARY}"

View file

@ -6,5 +6,5 @@ includedir=${prefix}/include
Name: llama Name: llama
Description: Port of Facebook's LLaMA model in C/C++ Description: Port of Facebook's LLaMA model in C/C++
Version: @PROJECT_VERSION@ Version: @PROJECT_VERSION@
Libs: -L${libdir} -lllama Libs: -L${libdir} -lggml -lggml-base -lllama
Cflags: -I${includedir} Cflags: -I${includedir}

View file

@ -0,0 +1,11 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR x86_64 )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( arch_c_flags "-march=native" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )

View file

@ -2,6 +2,8 @@
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
llama_add_compile_flags()
# Build info header # Build info header
# #
@ -66,6 +68,8 @@ add_library(${TARGET} STATIC
ngram-cache.h ngram-cache.h
sampling.cpp sampling.cpp
sampling.h sampling.h
speculative.cpp
speculative.h
) )
if (BUILD_SHARED_LIBS) if (BUILD_SHARED_LIBS)
@ -77,12 +81,12 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url # Use curl to download model url
if (LLAMA_CURL) if (LLAMA_CURL)
find_package(CURL REQUIRED) find_package(CURL REQUIRED)
add_definitions(-DLLAMA_USE_CURL) target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS}) include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED) find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
endif () endif ()
target_include_directories(${TARGET} PUBLIC .) target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_11) target_compile_features (${TARGET} PUBLIC cxx_std_17)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)

File diff suppressed because it is too large Load diff

View file

@ -12,6 +12,7 @@
struct common_arg { struct common_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON}; std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::set<enum llama_example> excludes = {};
std::vector<const char *> args; std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value const char * value_hint_2 = nullptr; // for second arg value
@ -53,9 +54,11 @@ struct common_arg {
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
common_arg & set_examples(std::initializer_list<enum llama_example> examples); common_arg & set_examples(std::initializer_list<enum llama_example> examples);
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
common_arg & set_env(const char * env); common_arg & set_env(const char * env);
common_arg & set_sparam(); common_arg & set_sparam();
bool in_example(enum llama_example ex); bool in_example(enum llama_example ex);
bool is_exclude(enum llama_example ex);
bool get_value_from_env(std::string & output); bool get_value_from_env(std::string & output);
bool has_value_from_env(); bool has_value_from_env();
std::string to_string(); std::string to_string();

File diff suppressed because it is too large Load diff

View file

@ -2,7 +2,7 @@
#pragma once #pragma once
#include "llama.h" #include "llama-cpp.h"
#include <string> #include <string>
#include <vector> #include <vector>
@ -24,20 +24,20 @@
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
struct common_lora_adapter_info { struct common_adapter_lora_info {
std::string path; std::string path;
float scale; float scale;
struct llama_adapter_lora * ptr;
}; };
struct common_lora_adapter_container : common_lora_adapter_info { using llama_tokens = std::vector<llama_token>;
struct llama_lora_adapter * adapter;
};
// build info // build info
extern int LLAMA_BUILD_NUMBER; extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT; extern const char * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER; extern const char * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET; extern const char * LLAMA_BUILD_TARGET;
struct common_control_vector_load_info; struct common_control_vector_load_info;
@ -78,6 +78,7 @@ enum llama_example {
LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_COUNT, LLAMA_EXAMPLE_COUNT,
}; };
@ -93,6 +94,7 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_TEMPERATURE = 7, COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
COMMON_SAMPLER_TYPE_XTC = 8, COMMON_SAMPLER_TYPE_XTC = 8,
COMMON_SAMPLER_TYPE_INFILL = 9, COMMON_SAMPLER_TYPE_INFILL = 9,
COMMON_SAMPLER_TYPE_PENALTIES = 10,
}; };
// dimensionality reduction methods, used by cvector-generator // dimensionality reduction methods, used by cvector-generator
@ -101,8 +103,14 @@ enum dimre_method {
DIMRE_METHOD_MEAN, DIMRE_METHOD_MEAN,
}; };
// sampler parameters enum common_conversation_mode {
struct common_sampler_params { COMMON_CONVERSATION_MODE_DISABLED = 0,
COMMON_CONVERSATION_MODE_ENABLED = 1,
COMMON_CONVERSATION_MODE_AUTO = 2,
};
// sampling parameters
struct common_params_sampling {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
int32_t n_prev = 64; // number of previous tokens to remember int32_t n_prev = 64; // number of previous tokens to remember
@ -128,14 +136,15 @@ struct common_sampler_params {
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false; bool ignore_eos = false;
bool no_perf = false; // disable performance metrics bool no_perf = false; // disable performance metrics
bool timing_per_token = false;
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
std::vector<enum common_sampler_type> samplers = { std::vector<enum common_sampler_type> samplers = {
COMMON_SAMPLER_TYPE_PENALTIES,
COMMON_SAMPLER_TYPE_DRY, COMMON_SAMPLER_TYPE_DRY,
COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_TOP_K,
COMMON_SAMPLER_TYPE_TYPICAL_P, COMMON_SAMPLER_TYPE_TYPICAL_P,
@ -153,21 +162,39 @@ struct common_sampler_params {
std::string print() const; std::string print() const;
}; };
struct common_params_speculative {
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_ctx = 0; // draft context size
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.9f; // minimum speculative decoding probability (greedy)
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
std::string model = ""; // draft model for speculative decoding // NOLINT
};
struct common_params_vocoder {
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string model = ""; // model path // NOLINT
std::string model_url = ""; // model url to download // NOLINT
};
struct common_params { struct common_params {
int32_t n_predict = -1; // new tokens to predict int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 4096; // context size int32_t n_ctx = 4096; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode int32_t n_sequences = 1; // number of sequences to decode
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width int32_t grp_attn_w = 512; // group-attention width
int32_t n_print = -1; // print token count every n tokens (-1 = disabled) int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@ -178,28 +205,35 @@ struct common_params {
float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold float defrag_thold = 0.1f; // KV cache defragmentation threshold
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
struct cpu_params cpuparams; struct cpu_params cpuparams;
struct cpu_params cpuparams_batch; struct cpu_params cpuparams_batch;
struct cpu_params draft_cpuparams;
struct cpu_params draft_cpuparams_batch;
ggml_backend_sched_eval_callback cb_eval = nullptr; ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr; void * cb_eval_user_data = nullptr;
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
struct common_sampler_params sparams; struct common_params_sampling sampling;
struct common_params_speculative speculative;
struct common_params_vocoder vocoder;
std::string model = ""; // model path // NOLINT std::string model = ""; // model path // NOLINT
std::string model_draft = ""; // draft model for speculative decoding // NOLINT std::string model_alias = ""; // model alias // NOLINT
std::string model_alias = "unknown"; // model alias // NOLINT
std::string model_url = ""; // model url to download // NOLINT std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token // NOLINT std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT std::string hf_repo = ""; // HF repo // NOLINT
@ -209,7 +243,6 @@ struct common_params {
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
std::string logdir = ""; // directory in which to save YAML log files // NOLINT
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT std::string logits_file = ""; // file for saving *all* logits // NOLINT
@ -219,8 +252,8 @@ struct common_params {
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides; std::vector<llama_model_kv_override> kv_overrides;
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply) bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
@ -248,7 +281,6 @@ struct common_params {
bool special = false; // enable special token output bool special = false; // enable special token output
bool interactive = false; // interactive mode bool interactive = false; // interactive mode
bool interactive_first = false; // wait for user input immediately bool interactive_first = false; // wait for user input immediately
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
@ -271,8 +303,10 @@ struct common_params {
bool warmup = true; // warmup run bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data bool check_tensors = false; // validate tensor data
std::string cache_type_k = "f16"; // KV cache data type for the K ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
// multimodal models (see examples/llava) // multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector // NOLINT std::string mmproj = ""; // path to multimodal projector // NOLINT
@ -422,6 +456,16 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
return parts; return parts;
} }
static bool string_starts_with(const std::string & str,
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
static bool string_ends_with(const std::string & str,
const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides); bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input); void string_process_escapes(std::string & input);
@ -444,25 +488,41 @@ std::string fs_get_cache_file(const std::string & filename);
// Model utils // Model utils
// //
// note: defines object's lifetime
struct common_init_result { struct common_init_result {
struct llama_model * model = nullptr; llama_model_ptr model;
struct llama_context * context = nullptr; llama_context_ptr context;
std::vector<common_lora_adapter_container> lora_adapters;
std::vector<llama_adapter_lora_ptr> lora;
}; };
struct common_init_result common_init_from_params(common_params & params); struct common_init_result common_init_from_params(common_params & params);
struct llama_model_params common_model_params_to_llama (const common_params & params); struct llama_model_params common_model_params_to_llama ( common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * common_load_model_from_url(
struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
std::pair<std::string, std::string> common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & hf_token);
// clear LoRA adapters from context, then apply new list of adapters // clear LoRA adapters from context, then apply new list of adapters
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters); void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
//
// Batch utils // Batch utils
//
void common_batch_clear(struct llama_batch & batch); void common_batch_clear(struct llama_batch & batch);
@ -473,6 +533,16 @@ void common_batch_add(
const std::vector<llama_seq_id> & seq_ids, const std::vector<llama_seq_id> & seq_ids,
bool logits); bool logits);
//
// Token utils
//
// longest common prefix
size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
// longet common subsequence
size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
// //
// Vocab utils // Vocab utils
// //
@ -486,7 +556,7 @@ std::vector<llama_token> common_tokenize(
bool parse_special = false); bool parse_special = false);
std::vector<llama_token> common_tokenize( std::vector<llama_token> common_tokenize(
const struct llama_model * model, const struct llama_vocab * vocab,
const std::string & text, const std::string & text,
bool add_special, bool add_special,
bool parse_special = false); bool parse_special = false);
@ -498,11 +568,21 @@ std::string common_token_to_piece(
llama_token token, llama_token token,
bool special = true); bool special = true);
std::string common_token_to_piece(
const struct llama_vocab * vocab,
llama_token token,
bool special = true);
// detokenizes a vector of tokens into a string // detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode` // should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens // optionally renders special/control tokens
std::string common_detokenize( std::string common_detokenize(
llama_context * ctx, const struct llama_context * ctx,
const std::vector<llama_token> & tokens,
bool special = true);
std::string common_detokenize(
const struct llama_vocab * vocab,
const std::vector<llama_token> & tokens, const std::vector<llama_token> & tokens,
bool special = true); bool special = true);
@ -516,6 +596,9 @@ struct common_chat_msg {
std::string content; std::string content;
}; };
// Get the built-in chat template for the model. Return empty string if not present.
std::string common_get_builtin_chat_template(const struct llama_model * model);
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool common_chat_verify_template(const std::string & tmpl); bool common_chat_verify_template(const std::string & tmpl);
@ -552,7 +635,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
// Embedding utils // Embedding utils
// //
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2); // TODO: repace embd_norm with an enum
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n); float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
@ -581,18 +665,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
// Split utils // Split utils
// //
static const char * const LLM_KV_SPLIT_NO = "split.no"; namespace {
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
// const char * const LLM_KV_SPLIT_NO = "split.no";
// YAML utils const char * const LLM_KV_SPLIT_COUNT = "split.count";
// const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data); }
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
void yaml_dump_non_result_info(
FILE * stream, const common_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

View file

@ -65,13 +65,13 @@ constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) { static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
if (part_static_it == nc_static.end()) { if (part_static_it == nc_static.end()) {
return -1; return LLAMA_TOKEN_NULL;
} }
const common_ngram_cache_part part_static = part_static_it->second; const common_ngram_cache_part part_static = part_static_it->second;
int max_count_static = 0; int max_count_static = 0;
int sum_count_static = 0; int sum_count_static = 0;
llama_token max_token = -1; llama_token max_token = LLAMA_TOKEN_NULL;
for (std::pair<llama_token, int> token_count_static : part_static) { for (std::pair<llama_token, int> token_count_static : part_static) {
const llama_token token = token_count_static.first; const llama_token token = token_count_static.first;
@ -85,10 +85,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
} }
if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) { if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
return -1; return LLAMA_TOKEN_NULL;
} }
if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) { if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
return -1; return LLAMA_TOKEN_NULL;
} }
return max_token; return max_token;
} }
@ -98,9 +98,9 @@ static llama_token try_draft(
common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static, common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
const int * min_sample_size, const int * min_percent) { const int * min_sample_size, const int * min_percent) {
llama_token drafted_token = -1; llama_token drafted_token = LLAMA_TOKEN_NULL;
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) { for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
const common_ngram ngram_primary = ngrams_primary[i]; const common_ngram ngram_primary = ngrams_primary[i];
common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
@ -112,7 +112,7 @@ static llama_token try_draft(
int max_count_primary = 0; int max_count_primary = 0;
int max_count_static = 0; int max_count_static = 0;
int sum_count_primary = 0; int sum_count_primary = 0;
llama_token max_token = -1; llama_token max_token = LLAMA_TOKEN_NULL;
for (std::pair<llama_token, int> token_count_primary : part_primary) { for (std::pair<llama_token, int> token_count_primary : part_primary) {
const llama_token token = token_count_primary.first; const llama_token token = token_count_primary.first;
@ -154,7 +154,7 @@ void common_ngram_cache_draft(
} }
while ((int) draft.size()-1 < n_draft) { while ((int) draft.size()-1 < n_draft) {
llama_token drafted_token = -1; llama_token drafted_token = LLAMA_TOKEN_NULL;
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
common_ngram ngram_static; common_ngram ngram_static;
@ -177,17 +177,17 @@ void common_ngram_cache_draft(
} }
ngrams_cd.push_back(ngram_cd); ngrams_cd.push_back(ngram_cd);
} }
if (drafted_token == -1) { if (drafted_token == LLAMA_TOKEN_NULL) {
drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax); drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
} }
if (drafted_token == -1) { if (drafted_token == LLAMA_TOKEN_NULL) {
drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict); drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
} }
if (drafted_token == -1) { if (drafted_token == LLAMA_TOKEN_NULL) {
drafted_token = try_draft(nc_static, ngram_static); drafted_token = try_draft(nc_static, ngram_static);
} }
if (drafted_token == -1) { if (drafted_token == LLAMA_TOKEN_NULL) {
break; break;
} }

View file

@ -17,13 +17,13 @@ struct common_ngram {
common_ngram() { common_ngram() {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = -1; tokens[i] = LLAMA_TOKEN_NULL;
} }
} }
common_ngram(const llama_token * input, const int ngram_size) { common_ngram(const llama_token * input, const int ngram_size) {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = i < ngram_size ? input[i] : -1; tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
} }
} }

View file

@ -99,7 +99,7 @@ struct ring_buffer {
}; };
struct common_sampler { struct common_sampler {
common_sampler_params params; common_params_sampling params;
struct llama_sampler * grmr; struct llama_sampler * grmr;
struct llama_sampler * chain; struct llama_sampler * chain;
@ -113,7 +113,10 @@ struct common_sampler {
void set_logits(struct llama_context * ctx, int idx) { void set_logits(struct llama_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx); const auto * logits = llama_get_logits_ith(ctx, idx);
const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
cur.resize(n_vocab); cur.resize(n_vocab);
@ -125,7 +128,7 @@ struct common_sampler {
} }
}; };
std::string common_sampler_params::print() const { std::string common_params_sampling::print() const {
char result[1024]; char result[1024];
snprintf(result, sizeof(result), snprintf(result, sizeof(result),
@ -141,14 +144,16 @@ std::string common_sampler_params::print() const {
return std::string(result); return std::string(result);
} }
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) { struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = params.no_perf; lparams.no_perf = params.no_perf;
auto * result = new common_sampler { auto * result = new common_sampler {
/* .params = */ params, /* .params = */ params,
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"), /* .grmr = */ llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"),
/* .chain = */ llama_sampler_chain_init(lparams), /* .chain = */ llama_sampler_chain_init(lparams),
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)), /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {}, /* .cur = */ {},
@ -157,36 +162,24 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain, llama_sampler_chain_add(result->chain,
llama_sampler_init_logit_bias( llama_sampler_init_logit_bias(
llama_n_vocab(model), llama_vocab_n_tokens(vocab),
params.logit_bias.size(), params.logit_bias.size(),
params.logit_bias.data())); params.logit_bias.data()));
llama_sampler_chain_add(result->chain,
llama_sampler_init_penalties(
llama_n_vocab (model),
llama_token_eos(model),
llama_token_nl (model),
params.penalty_last_n,
params.penalty_repeat,
params.penalty_freq,
params.penalty_present,
params.penalize_nl,
params.ignore_eos));
if (params.mirostat == 0) { if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) { for (const auto & cnstr : params.samplers) {
switch (cnstr) { switch (cnstr) {
case COMMON_SAMPLER_TYPE_DRY: case COMMON_SAMPLER_TYPE_DRY:
{ {
std::vector<const char*> c_breakers; std::vector<const char *> c_breakers;
c_breakers.reserve(params.dry_sequence_breakers.size()); c_breakers.reserve(params.dry_sequence_breakers.size());
for (const auto& str : params.dry_sequence_breakers) { for (const auto & str : params.dry_sequence_breakers) {
c_breakers.push_back(str.c_str()); c_breakers.push_back(str.c_str());
} }
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
} }
break; break;
case COMMON_SAMPLER_TYPE_TOP_K: case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break; break;
@ -206,7 +199,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break; break;
case COMMON_SAMPLER_TYPE_INFILL: case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
break;
case COMMON_SAMPLER_TYPE_PENALTIES:
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
break; break;
default: default:
GGML_ASSERT(false && "unknown sampler type"); GGML_ASSERT(false && "unknown sampler type");
@ -215,7 +211,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) { } else if (params.mirostat == 1) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) { } else if (params.mirostat == 2) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
@ -320,6 +316,45 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
return cur_p.data[cur_p.selected].id; return cur_p.data[cur_p.selected].id;
} }
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
std::vector<llama_token> result;
result.reserve(idxs.size());
size_t i = 0;
for (; i < draft.size(); i++) {
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
common_sampler_accept(gsmpl, id, true);
result.push_back(id);
if (draft[i] != id) {
break;
}
}
if (i == draft.size()) {
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
common_sampler_accept(gsmpl, id, true);
result.push_back(id);
}
return result;
}
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
std::vector<int> idxs(draft.size() + 1);
for (size_t i = 0; i < idxs.size(); ++i) {
idxs[i] = i;
}
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
}
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
return llama_sampler_get_seed(gsmpl->chain); return llama_sampler_get_seed(gsmpl->chain);
} }
@ -376,6 +411,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
case COMMON_SAMPLER_TYPE_XTC: return 'x'; case COMMON_SAMPLER_TYPE_XTC: return 'x';
case COMMON_SAMPLER_TYPE_INFILL: return 'i'; case COMMON_SAMPLER_TYPE_INFILL: return 'i';
case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
default : return '?'; default : return '?';
} }
} }
@ -390,6 +426,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
case COMMON_SAMPLER_TYPE_XTC: return "xtc"; case COMMON_SAMPLER_TYPE_XTC: return "xtc";
case COMMON_SAMPLER_TYPE_INFILL: return "infill"; case COMMON_SAMPLER_TYPE_INFILL: return "infill";
case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
default : return ""; default : return "";
} }
} }
@ -404,6 +441,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "xtc", COMMON_SAMPLER_TYPE_XTC }, { "xtc", COMMON_SAMPLER_TYPE_XTC },
{ "infill", COMMON_SAMPLER_TYPE_INFILL }, { "infill", COMMON_SAMPLER_TYPE_INFILL },
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
}; };
// since samplers names are written multiple ways // since samplers names are written multiple ways
@ -450,6 +488,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES },
}; };
std::vector<common_sampler_type> samplers; std::vector<common_sampler_type> samplers;

View file

@ -36,7 +36,7 @@ struct common_sampler;
// llama_sampler API overloads // llama_sampler API overloads
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params); struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
void common_sampler_free(struct common_sampler * gsmpl); void common_sampler_free(struct common_sampler * gsmpl);
@ -60,6 +60,27 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
// //
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
// generalized version of common_sampler_sample
//
// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
//
// common_sampler_sample_n(gsmpl, ctx, { idx }, {});
//
// is equivalent to
//
// common_sampler_sample(gsmpl, ctx, idx);
// common_sampler_accept(gsmpl, token, true);
//
// requires: idxs.size() == draft.size() + 1
//
// returns at least 1 token, up to idxs.size()
//
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
// assume idxs == [ 0, 1, 2, ..., draft.size() ]
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
// helpers // helpers

277
common/speculative.cpp Normal file
View file

@ -0,0 +1,277 @@
#include "speculative.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include <cstring>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
struct common_speculative {
struct llama_context * ctx;
struct common_sampler * smpl;
llama_batch batch;
llama_tokens prompt;
};
struct common_speculative * common_speculative_init(
struct llama_context * ctx_dft) {
auto * result = new common_speculative {
/* .ctx = */ ctx_dft,
/* .smpl = */ nullptr,
/* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
/* .prompt = */ {},
};
// TODO: optimize or pass from outside?
#if 0
{
common_params_sampling params;
params.no_perf = false;
params.top_k = 40;
params.top_p = 0.9;
params.samplers = {
COMMON_SAMPLER_TYPE_TOP_K,
COMMON_SAMPLER_TYPE_TOP_P,
COMMON_SAMPLER_TYPE_INFILL,
};
result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
}
#else
{
common_params_sampling params;
params.no_perf = false;
params.top_k = 10;
params.samplers = {
COMMON_SAMPLER_TYPE_TOP_K,
};
result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
}
#endif
return result;
}
void common_speculative_free(struct common_speculative * spec) {
if (spec == nullptr) {
return;
}
common_sampler_free(spec->smpl);
llama_batch_free(spec->batch);
delete spec;
}
bool common_speculative_are_compatible(
const struct llama_context * ctx_tgt,
const struct llama_context * ctx_dft) {
const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
const struct llama_model * model_dft = llama_get_model(ctx_dft);
const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
const struct llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
const bool vocab_type_dft = llama_vocab_type(vocab_dft);
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) {
LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
"vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
return false;
}
if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
return false;
}
{
const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
"target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
__func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return false;
}
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but "
"token %d content differs - target '%s', draft '%s'\n", __func__, i,
common_token_to_piece(ctx_tgt, i).c_str(),
common_token_to_piece(ctx_dft, i).c_str());
return false;
}
}
}
return true;
}
llama_tokens common_speculative_gen_draft(
struct common_speculative * spec,
struct common_speculative_params params,
const llama_tokens & prompt_tgt,
llama_token id_last) {
auto & batch = spec->batch;
auto & ctx = spec->ctx;
auto & smpl = spec->smpl;
auto & prompt = spec->prompt;
int reuse_i = 0;
int reuse_n = 0;
const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
// reuse as much as possible from the old draft context
// ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
for (int i = 0; i < (int) prompt.size(); ++i) {
int cur = 0;
while (i_start + cur < (int) prompt_tgt.size() &&
i + cur < (int) prompt.size() &&
prompt_tgt[i_start + cur] == prompt[i + cur]) {
cur++;
}
if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
reuse_i = i;
reuse_n = cur;
}
}
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
llama_tokens result;
result.reserve(params.n_draft);
if (reuse_n == 0) {
llama_kv_cache_clear(ctx);
prompt.clear();
} else {
// this happens when a previous draft has been discarded (for example, due to being too small), but the
// target model agreed with it. in this case, we simply pass back the previous results to save compute
if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
result.push_back(prompt[i]);
if (params.n_draft <= (int) result.size()) {
break;
}
}
return result;
}
if (reuse_i > 0) {
llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
}
if (reuse_n < (int) prompt.size()) {
llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
prompt.erase(prompt.begin() + reuse_n, prompt.end());
}
}
// prepare a batch to evaluate any new tokens in the prompt
common_batch_clear(batch);
for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
//LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
prompt.push_back(prompt_tgt[i]);
}
// we should rarely end-up here during normal decoding
if (batch.n_tokens > 0) {
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
llama_decode(ctx, batch);
}
const llama_pos n_past = prompt.size();
LOG_DBG("%s: n_past = %d\n", __func__, n_past);
common_batch_clear(batch);
common_batch_add (batch, id_last, n_past, { 0 }, true);
prompt.push_back(id_last);
//LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
llama_decode(ctx, batch);
common_sampler_reset(smpl);
// sample n_draft tokens from the draft model
for (int i = 0; i < params.n_draft; ++i) {
common_batch_clear(batch);
common_sampler_sample(smpl, ctx, 0, true);
const auto * cur_p = common_sampler_get_candidates(smpl);
for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
}
// add drafted token for each sequence
const llama_token id = cur_p->data[0].id;
// only collect very high-confidence draft tokens
if (cur_p->data[0].p < params.p_min) {
break;
}
common_sampler_accept(smpl, id, true);
result.push_back(id);
if (params.n_draft <= (int) result.size()) {
break;
}
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
// evaluate the drafted tokens on the draft model
llama_decode(ctx, batch);
prompt.push_back(id);
}
return result;
}

28
common/speculative.h Normal file
View file

@ -0,0 +1,28 @@
#pragma once
#include "llama.h"
#include "common.h"
struct common_speculative;
struct common_speculative_params {
int n_draft = 16; // max drafted tokens
int n_reuse = 256;
float p_min = 0.9f; // min probabiliy required to accept a token in the draft
};
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
void common_speculative_free(struct common_speculative * spec);
bool common_speculative_are_compatible(
const struct llama_context * ctx_tgt,
const struct llama_context * ctx_dft);
// sample up to n_draft tokens and add them to the batch using the draft model
llama_tokens common_speculative_gen_draft(
struct common_speculative * spec,
struct common_speculative_params params,
const llama_tokens & prompt,
llama_token id_last);

View file

@ -221,17 +221,17 @@ class Model:
self.gguf_writer.add_context_length(n_ctx) self.gguf_writer.add_context_length(n_ctx)
logger.info(f"gguf: context length = {n_ctx}") logger.info(f"gguf: context length = {n_ctx}")
n_embd = self.find_hparam(["hidden_size", "n_embd"]) if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
self.gguf_writer.add_embedding_length(n_embd) self.gguf_writer.add_embedding_length(n_embd)
logger.info(f"gguf: embedding length = {n_embd}") logger.info(f"gguf: embedding length = {n_embd}")
if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
self.gguf_writer.add_feed_forward_length(n_ff) self.gguf_writer.add_feed_forward_length(n_ff)
logger.info(f"gguf: feed forward length = {n_ff}") logger.info(f"gguf: feed forward length = {n_ff}")
n_head = self.find_hparam(["num_attention_heads", "n_head"]) if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count(n_head)
logger.info(f"gguf: head count = {n_head}") logger.info(f"gguf: head count = {n_head}")
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
self.gguf_writer.add_head_count_kv(n_head_kv) self.gguf_writer.add_head_count_kv(n_head_kv)
@ -296,7 +296,9 @@ class Model:
break break
for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
data = data_torch.squeeze().numpy() # TODO: why do we squeeze here?
# data = data_torch.squeeze().numpy()
data = data_torch.numpy()
# if data ends up empty, it means data_torch was a scalar tensor -> restore # if data ends up empty, it means data_torch was a scalar tensor -> restore
if len(data.shape) == 0: if len(data.shape) == 0:
@ -324,6 +326,9 @@ class Model:
gguf.MODEL_TENSOR.TIME_MIX_W2, gguf.MODEL_TENSOR.TIME_MIX_W2,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
gguf.MODEL_TENSOR.POSNET_NORM1,
gguf.MODEL_TENSOR.POSNET_NORM2,
) )
) )
or not new_name.endswith(".weight") or not new_name.endswith(".weight")
@ -473,6 +478,11 @@ class Model:
return modelcls return modelcls
return func return func
@classmethod
def print_registered_models(cls):
for name in sorted(cls._model_classes.keys()):
logger.error(f"- {name}")
@classmethod @classmethod
def from_model_architecture(cls, arch: str) -> type[Model]: def from_model_architecture(cls, arch: str) -> type[Model]:
try: try:
@ -525,9 +535,19 @@ class Model:
else: else:
token: str = reverse_vocab[i] token: str = reverse_vocab[i]
if token in added_vocab: if token in added_vocab:
# The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
if not tokenizer.added_tokens_decoder[i].normalized:
previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token): if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
toktypes.append(gguf.TokenType.CONTROL) toktypes.append(gguf.TokenType.CONTROL)
else: else:
# NOTE: this was added for Gemma.
# Encoding and decoding the tokens above isn't sufficient for this case.
token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
toktypes.append(gguf.TokenType.USER_DEFINED) toktypes.append(gguf.TokenType.USER_DEFINED)
else: else:
@ -571,6 +591,9 @@ class Model:
if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
# ref: https://huggingface.co/tiiuae/falcon-7b # ref: https://huggingface.co/tiiuae/falcon-7b
res = "falcon" res = "falcon"
if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
# ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
res = "falcon3"
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5 # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
res = "bert-bge" res = "bert-bge"
@ -658,6 +681,21 @@ class Model:
if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
# ref: https://huggingface.co/facebook/chameleon-7b # ref: https://huggingface.co/facebook/chameleon-7b
res = "chameleon" res = "chameleon"
if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
res = "minerva-7b"
if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
# ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
res = "roberta-bpe"
if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
# ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
res = "gigachat"
if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
# ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
res = "megrez"
if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
# ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
res = "deepseek-v3"
if res is None: if res is None:
logger.warning("\n") logger.warning("\n")
@ -680,6 +718,9 @@ class Model:
return res return res
# Marker: End get_vocab_base_pre # Marker: End get_vocab_base_pre
def _set_vocab_none(self) -> None:
self.gguf_writer.add_tokenizer_model("none")
def _set_vocab_gpt2(self) -> None: def _set_vocab_gpt2(self) -> None:
tokens, toktypes, tokpre = self.get_vocab_base() tokens, toktypes, tokpre = self.get_vocab_base()
self.gguf_writer.add_tokenizer_model("gpt2") self.gguf_writer.add_tokenizer_model("gpt2")
@ -1663,6 +1704,178 @@ class LlamaModel(Model):
raise ValueError(f"Unprocessed experts: {experts}") raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("DeciLMForCausalLM")
class DeciModel(Model):
model_arch = gguf.MODEL_ARCH.DECI
@staticmethod
def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
# DeciLM-specific code
intermediate_size = int(2 * ffn_mult * n_embd / 3)
return DeciModel._find_multiple(intermediate_size, 256)
@staticmethod
def _find_multiple(n: int, k: int) -> int:
# DeciLM-specific code
if n % k == 0:
return n
return n + k - (n % k)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
_block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
assert self.block_count == len(_block_configs)
self._num_kv_heads = list()
self._num_heads = list()
_ffn_multipliers = list()
# ***linear attention layer***
# if n_heads_in_group is None and replace_with_linear is True
# then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
# ***attention-free layer***
# if n_heads_in_group is None and replace_with_linear is False
# then _num_kv_heads[il] is 0 and _num_heads[il] is 0
# ***normal attention-layer***
# if n_heads_in_group is not None, then
# _num_kv_heads[il] is num_attention_head // n_heads_in_group and
# _num_heads[il] is num_attention_head
for il in range(len(_block_configs)):
if _block_configs[il]["attention"]["n_heads_in_group"] is None:
if _block_configs[il]["attention"]["replace_with_linear"] is True:
self._num_kv_heads.append(0)
self._num_heads.append(self.hparams["num_attention_heads"])
else:
self._num_kv_heads.append(0)
self._num_heads.append(0)
else:
self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
self._num_heads.append(self.hparams["num_attention_heads"])
_ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
assert self.block_count == len(self._num_kv_heads)
assert self.block_count == len(self._num_heads)
assert self.block_count == len(_ffn_multipliers)
assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
self._ffn_dims: list[int] = [
DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
for multiplier in _ffn_multipliers
]
def set_vocab(self):
# Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
# eos_token from '|eot_id|' to '|end_of_text|'
if self.hparams.get("vocab_size", 128256) == 128256:
tokens, toktypes, tokpre = self.get_vocab_base()
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer)
else:
# DeciLM-7B
self._set_vocab_llama_hf()
def set_gguf_parameters(self):
if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
assert self.block_count == len(self._num_kv_heads)
assert self.block_count == len(self._num_heads)
assert self.block_count == len(self._ffn_dims)
if (rope_theta := self.hparams.get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base(rope_theta)
self.gguf_writer.add_head_count_kv(self._num_kv_heads)
self.gguf_writer.add_head_count(self._num_heads)
self.gguf_writer.add_feed_forward_length(self._ffn_dims)
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
self.gguf_writer.add_file_type(self.ftype)
else: # DeciLM-7B
super().set_gguf_parameters()
if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
assert self.block_count == len(self._num_kv_heads)
self.gguf_writer.add_head_count_kv(self._num_kv_heads)
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "head_dim" in hparams:
rope_dim = hparams["head_dim"]
else:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
if bid is not None:
if "num_key_value_heads_per_layer" in self.hparams:
n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
elif "block_configs" in self.hparams:
n_kv_head = self._num_kv_heads[bid]
n_head = self._num_heads[bid]
else:
n_kv_head = self.hparams.get("num_key_value_heads")
else:
n_kv_head = self.hparams.get("num_key_value_heads")
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = DeciModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
return [(self.map_tensor_name(name), data_torch)]
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
if rope_scaling.get("rope_type", '').lower() == "llama3":
base = self.hparams.get("rope_theta", 10000.0)
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
factor = rope_scaling.get("factor", 8.0)
low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
assert low_freq_wavelen != high_freq_wavelen
rope_factors = []
for freq in freqs:
wavelen = 2 * math.pi / freq
if wavelen < high_freq_wavelen:
rope_factors.append(1)
elif wavelen > low_freq_wavelen:
rope_factors.append(factor)
else:
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
def prepare_tensors(self):
super().prepare_tensors()
@Model.register("BitnetForCausalLM") @Model.register("BitnetForCausalLM")
class BitnetModel(Model): class BitnetModel(Model):
model_arch = gguf.MODEL_ARCH.BITNET model_arch = gguf.MODEL_ARCH.BITNET
@ -1831,29 +2044,40 @@ class MiniCPMModel(Model):
model_arch = gguf.MODEL_ARCH.MINICPM model_arch = gguf.MODEL_ARCH.MINICPM
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"] super().set_gguf_parameters()
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) embedding_scale = float(self.hparams["scale_emb"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_embedding_scale(embedding_scale)
self.gguf_writer.add_block_count(block_count) logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_residual_scale(residual_scale)
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_logit_scale(logit_scale)
self.gguf_writer.add_file_type(self.ftype) logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
if self.hparams.get("rope_scaling") is not None:
if self.hparams["rope_scaling"].get("type") == "longrope":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is not None:
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
def set_vocab(self): def set_vocab(self):
self._set_vocab_llama_hf() self._set_vocab_sentencepiece()
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape)
)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused del bid # unused
@ -1863,9 +2087,9 @@ class MiniCPMModel(Model):
# HF models permute some of the tensors, so we need to undo that # HF models permute some of the tensors, so we need to undo that
if name.endswith(("q_proj.weight")): if name.endswith(("q_proj.weight")):
data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight")): if name.endswith(("k_proj.weight")):
data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@ -1975,6 +2199,75 @@ class Qwen2Model(Model):
except FileNotFoundError: except FileNotFoundError:
self._set_vocab_gpt2() self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
@Model.register("Qwen2VLForConditionalGeneration")
class Qwen2VLModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN2VL
def set_gguf_parameters(self):
super().set_gguf_parameters()
mrope_section = self.hparams["rope_scaling"]["mrope_section"]
mrope_section += [0] * max(0, 4 - len(mrope_section))
self.gguf_writer.add_rope_dimension_sections(mrope_section)
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_gpt2()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
for name, data in super().get_tensors():
if name.startswith("visual."):
continue
yield name, data
@Model.register("WavTokenizerDec")
class WavTokenizerDecModel(Model):
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if \
name.endswith("codebook.cluster_size") or \
name.endswith("codebook.embed_avg") or \
name.endswith("codebook.inited"):
logger.debug(f"Skipping {name!r}")
return []
logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
return [(self.map_tensor_name(name), data_torch)]
def set_vocab(self):
self._set_vocab_none()
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_vocab_size (self.hparams["vocab_size"])
self.gguf_writer.add_features_length (self.hparams["n_embd_features"])
self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"])
self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"])
self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"])
self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"])
self.gguf_writer.add_causal_attention(False)
@Model.register("Qwen2MoeForCausalLM") @Model.register("Qwen2MoeForCausalLM")
class Qwen2MoeModel(Model): class Qwen2MoeModel(Model):
@ -2104,6 +2397,15 @@ class Phi3MiniModel(Model):
model_arch = gguf.MODEL_ARCH.PHI3 model_arch = gguf.MODEL_ARCH.PHI3
def set_vocab(self): def set_vocab(self):
# Phi-4 model uses GPT2Tokenizer
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
tokenizer_class = tokenizer_config_json['tokenizer_class']
if tokenizer_class == 'GPT2Tokenizer':
return self._set_vocab_gpt2()
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
tokenizer_path = self.dir_model / 'tokenizer.model' tokenizer_path = self.dir_model / 'tokenizer.model'
@ -2220,7 +2522,11 @@ class Phi3MiniModel(Model):
self.gguf_writer.add_rope_dimension_count(rope_dims) self.gguf_writer.add_rope_dimension_count(rope_dims)
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"])) sliding_window = self.hparams.get("sliding_window")
# use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
if sliding_window is None:
sliding_window = 0
self.gguf_writer.add_sliding_window(sliding_window)
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_embd = self.find_hparam(["hidden_size", "n_embd"])
@ -2262,6 +2568,63 @@ class Phi3MiniModel(Model):
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
@Model.register("PhiMoEForCausalLM")
class PhiMoeModel(Phi3MiniModel):
model_arch = gguf.MODEL_ARCH.PHIMOE
_experts: list[dict[str, Tensor]] | None = None
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
n_experts = self.hparams["num_local_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["w1", "w2", "w3"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("PlamoForCausalLM") @Model.register("PlamoForCausalLM")
class PlamoModel(Model): class PlamoModel(Model):
model_arch = gguf.MODEL_ARCH.PLAMO model_arch = gguf.MODEL_ARCH.PLAMO
@ -2519,7 +2882,7 @@ class InternLM2Model(Model):
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@Model.register("BertModel", "CamembertModel") @Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
class BertModel(Model): class BertModel(Model):
model_arch = gguf.MODEL_ARCH.BERT model_arch = gguf.MODEL_ARCH.BERT
@ -2560,7 +2923,8 @@ class BertModel(Model):
# we need this to validate the size of the token_type embeddings # we need this to validate the size of the token_type embeddings
# though currently we are passing all zeros to the token_type embeddings # though currently we are passing all zeros to the token_type embeddings
self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B" # "Sequence A" or "Sequence B"
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
# convert to phantom space vocab # convert to phantom space vocab
def phantom(tok): def phantom(tok):
@ -2584,13 +2948,73 @@ class BertModel(Model):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused del bid # unused
if name.startswith("bert."):
name = name[5:]
if name.endswith(".gamma"):
name = name[:-6] + ".weight"
if name.endswith(".beta"):
name = name[:-5] + ".bias"
# we are only using BERT for embeddings so we don't need the pooling layer # we are only using BERT for embeddings so we don't need the pooling layer
if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
return [] # we don't need these return [] # we don't need these
if name.startswith("cls.predictions"):
return []
if name.startswith("cls.seq_relationship"):
return []
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@Model.register("RobertaModel")
class RobertaModel(BertModel):
model_arch = gguf.MODEL_ARCH.BERT
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# we need the pad_token_id to know how to chop down position_embd matrix
if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
self._position_offset = 1 + pad_token_id
if "max_position_embeddings" in self.hparams:
self.hparams["max_position_embeddings"] -= self._position_offset
else:
self._position_offset = None
def set_vocab(self):
"""Support BPE tokenizers for roberta models"""
bpe_tok_path = self.dir_model / "tokenizer.json"
if bpe_tok_path.exists():
self._set_vocab_gpt2()
self.gguf_writer.add_add_bos_token(True)
self.gguf_writer.add_add_eos_token(True)
# we need this to validate the size of the token_type embeddings
# though currently we are passing all zeros to the token_type embeddings
# "Sequence A" or "Sequence B"
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
else:
return super().set_vocab()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# if name starts with "roberta.", remove the prefix
# e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
if name.startswith("roberta."):
name = name[8:]
# position embeddings start at pad_token_id + 1, so just chop down the weight tensor
if name == "embeddings.position_embeddings.weight":
if self._position_offset is not None:
data_torch = data_torch[self._position_offset:,:]
return super().modify_tensors(data_torch, name, bid)
@Model.register("NomicBertModel") @Model.register("NomicBertModel")
class NomicBertModel(BertModel): class NomicBertModel(BertModel):
model_arch = gguf.MODEL_ARCH.NOMIC_BERT model_arch = gguf.MODEL_ARCH.NOMIC_BERT
@ -2707,7 +3131,7 @@ class XLMRobertaModel(BertModel):
self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_add_space_prefix(add_prefix) self.gguf_writer.add_add_space_prefix(add_prefix)
self.gguf_writer.add_token_type_count(1) self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
if precompiled_charsmap: if precompiled_charsmap:
self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
@ -2898,6 +3322,8 @@ class Rwkv6Model(Model):
# required by llama.cpp, unused # required by llama.cpp, unused
self.gguf_writer.add_head_count(0) self.gguf_writer.add_head_count(0)
lerp_weights: dict[int, dict[str, Tensor]] = {}
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
new_name = self.map_tensor_name(name) new_name = self.map_tensor_name(name)
@ -2910,14 +3336,87 @@ class Rwkv6Model(Model):
if new_name.endswith("time_mix_w2.weight"): if new_name.endswith("time_mix_w2.weight"):
data_torch = data_torch.permute(0, 2, 1) data_torch = data_torch.permute(0, 2, 1)
rescale_every_n_layers = self.hparams["rescale_every"] if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
if rescale_every_n_layers > 0: data_torch = data_torch.squeeze()
if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) try:
rescale_every_n_layers = self.hparams["rescale_every"]
if rescale_every_n_layers > 0:
if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
except KeyError:
pass
# concat time_mix_lerp weights to reduce some cpu overhead
# also reduces the number of tensors in the model
if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
try:
self.lerp_weights[bid][new_name] = data_torch
except KeyError:
self.lerp_weights[bid] = {new_name: data_torch}
if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
yield (new_name, data)
return
yield (new_name, data_torch) yield (new_name, data_torch)
@Model.register("RWKV6Qwen2ForCausalLM")
class RWKV6Qwen2Model(Rwkv6Model):
model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_gpt2()
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
num_attention_heads = self.hparams["num_attention_heads"]
num_key_value_heads = self.hparams["num_key_value_heads"]
hidden_size = self.hparams["hidden_size"]
head_size = hidden_size // num_attention_heads
rms_norm_eps = self.hparams["rms_norm_eps"]
intermediate_size = self.hparams["intermediate_size"]
time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
# RWKV isn't context limited
self.gguf_writer.add_context_length(1048576)
self.gguf_writer.add_embedding_length(hidden_size)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_wkv_head_size(head_size)
self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
self.gguf_writer.add_feed_forward_length(intermediate_size)
self.gguf_writer.add_file_type(self.ftype)
# special parameters for time_mixing in RWKV6QWEN2
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_token_shift_count(1)
# RWKV6QWEN2 use grouped key/value like GQA
self.gguf_writer.add_head_count_kv(num_key_value_heads)
# required by llama.cpp, unused
self.gguf_writer.add_head_count(0)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
for new_name, data in super().modify_tensors(data_torch, name, bid):
if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
data = data.view(5, -1, data.shape[-1])
# rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg
# permute them here to avoid code changes
data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
if "w2" in new_name:
data = data.view(5, -1, data.shape[-1])
yield (new_name, data)
continue
yield (new_name, data)
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
class MambaModel(Model): class MambaModel(Model):
model_arch = gguf.MODEL_ARCH.MAMBA model_arch = gguf.MODEL_ARCH.MAMBA
@ -3012,6 +3511,24 @@ class CommandR2Model(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
@Model.register("Cohere2ForCausalLM")
class Cohere2Model(Model):
model_arch = gguf.MODEL_ARCH.COHERE2
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
rotary_pct = self.hparams["rotary_pct"]
hidden_size = self.hparams["hidden_size"]
num_attention_heads = self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
@Model.register("OlmoForCausalLM") @Model.register("OlmoForCausalLM")
@Model.register("OLMoForCausalLM") @Model.register("OLMoForCausalLM")
class OlmoModel(Model): class OlmoModel(Model):
@ -3040,6 +3557,11 @@ class OlmoModel(Model):
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@Model.register("Olmo2ForCausalLM")
class Olmo2Model(Model):
model_arch = gguf.MODEL_ARCH.OLMO2
@Model.register("OlmoeForCausalLM") @Model.register("OlmoeForCausalLM")
class OlmoeModel(Model): class OlmoeModel(Model):
model_arch = gguf.MODEL_ARCH.OLMOE model_arch = gguf.MODEL_ARCH.OLMOE
@ -3373,7 +3895,99 @@ class ArcticModel(Model):
raise ValueError(f"Unprocessed experts: {experts}") raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("DeepseekForCausalLM")
class DeepseekModel(Model):
model_arch = gguf.MODEL_ARCH.DEEPSEEK
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
if "head_dim" in hparams:
rope_dim = hparams["head_dim"]
else:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_weights_scale(1.0)
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
_experts: list[dict[str, Tensor]] | None = None
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
# process the experts separately
if name.find("mlp.experts") != -1:
n_experts = self.hparams["n_routed_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("DeepseekV2ForCausalLM") @Model.register("DeepseekV2ForCausalLM")
@Model.register("DeepseekV3ForCausalLM")
class DeepseekV2Model(Model): class DeepseekV2Model(Model):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2 model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@ -3395,6 +4009,15 @@ class DeepseekV2Model(Model):
self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
if hparams["scoring_func"] == "sigmoid":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
elif hparams["scoring_func"] == "softmax":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
else:
raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
@ -3407,6 +4030,16 @@ class DeepseekV2Model(Model):
_experts: list[dict[str, Tensor]] | None = None _experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# rename e_score_correction_bias tensors
if name.endswith("e_score_correction_bias"):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
# skip Multi-Token Prediction (MTP) layers
block_count = self.hparams["num_hidden_layers"]
match = re.match(r"model.layers.(\d+)", name)
if match and int(match.group(1)) >= block_count:
return []
# process the experts separately # process the experts separately
if name.find("mlp.experts") != -1: if name.find("mlp.experts") != -1:
n_experts = self.hparams["n_routed_experts"] n_experts = self.hparams["n_routed_experts"]
@ -4301,6 +4934,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument( parser.add_argument(
"model", type=Path, "model", type=Path,
help="directory containing model file", help="directory containing model file",
nargs="?",
) )
parser.add_argument( parser.add_argument(
"--use-temp-file", action="store_true", "--use-temp-file", action="store_true",
@ -4338,8 +4972,15 @@ def parse_args() -> argparse.Namespace:
"--metadata", type=Path, "--metadata", type=Path,
help="Specify the path for an authorship metadata override file" help="Specify the path for an authorship metadata override file"
) )
parser.add_argument(
"--print-supported-models", action="store_true",
help="Print the supported models"
)
return parser.parse_args() args = parser.parse_args()
if not args.print_supported_models and args.model is None:
parser.error("the following arguments are required: model")
return args
def split_str_to_n_bytes(split_str: str) -> int: def split_str_to_n_bytes(split_str: str) -> int:
@ -4363,6 +5004,11 @@ def split_str_to_n_bytes(split_str: str) -> int:
def main() -> None: def main() -> None:
args = parse_args() args = parse_args()
if args.print_supported_models:
logger.error("Supported models:")
Model.print_registered_models()
sys.exit(0)
if args.verbose: if args.verbose:
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
else: else:

View file

@ -17,7 +17,7 @@
# #
# python3 convert_hf_to_gguf_update.py <huggingface_token> # python3 convert_hf_to_gguf_update.py <huggingface_token>
# #
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py # - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
# - Update llama.cpp with the new pre-tokenizer if necessary # - Update llama.cpp with the new pre-tokenizer if necessary
# #
# TODO: generate tokenizer tests for llama.cpp # TODO: generate tokenizer tests for llama.cpp
@ -72,6 +72,7 @@ models = [
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
{"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
{"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", }, {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
@ -102,6 +103,11 @@ models = [
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
] ]

View file

@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
base_name = lora_tensor_name.replace("base_model.model.", "") base_name = lora_tensor_name.replace("base_model.model.", "")
base_name = base_name.replace(".lora_A.weight", ".weight") base_name = base_name.replace(".lora_A.weight", ".weight")
base_name = base_name.replace(".lora_B.weight", ".weight") base_name = base_name.replace(".lora_B.weight", ".weight")
# models produced by mergekit-extract-lora have token embeddings in the adapter
base_name = base_name.replace(".lora_embedding_A", ".weight")
base_name = base_name.replace(".lora_embedding_B", ".weight")
return base_name return base_name
@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace:
"--base", type=Path, "--base", type=Path,
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config", help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
) )
parser.add_argument(
"--base-model-id", type=str,
help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
)
parser.add_argument( parser.add_argument(
"lora_path", type=Path, "lora_path", type=Path,
help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)", help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@ -290,6 +297,7 @@ if __name__ == '__main__':
dir_base_model: Path | None = args.base dir_base_model: Path | None = args.base
dir_lora: Path = args.lora_path dir_lora: Path = args.lora_path
base_model_id: str | None = args.base_model_id
lora_config = dir_lora / "adapter_config.json" lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors" input_model = dir_lora / "adapter_model.safetensors"
@ -313,7 +321,10 @@ if __name__ == '__main__':
lparams: dict[str, Any] = json.load(f) lparams: dict[str, Any] = json.load(f)
# load base model # load base model
if dir_base_model is None: if base_model_id is not None:
logger.info(f"Loading base model from Hugging Face: {base_model_id}")
hparams = load_hparams_from_hf(base_model_id)
elif dir_base_model is None:
if "base_model_name_or_path" in lparams: if "base_model_name_or_path" in lparams:
model_id = lparams["base_model_name_or_path"] model_id = lparams["base_model_name_or_path"]
logger.info(f"Loading base model from Hugging Face: {model_id}") logger.info(f"Loading base model from Hugging Face: {model_id}")
@ -371,11 +382,16 @@ if __name__ == '__main__':
if self.lazy: if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor) tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name) base_name = get_base_tensor_name(name)
is_lora_a = ".lora_A.weight" in name # note: mergekit-extract-lora also adds token embeddings to the adapter
is_lora_b = ".lora_B.weight" in name is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
if not is_lora_a and not is_lora_b: if not is_lora_a and not is_lora_b:
if ".base_layer.weight" in name: if ".base_layer.weight" in name:
continue continue
# mergekit-extract-lora add these layernorm to the adapter, we need to keep them
if "_layernorm" in name or ".norm" in name:
yield (base_name, tensor)
continue
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor") logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
if ".embed_tokens.weight" in name or ".lm_head.weight" in name: if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning") logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
@ -407,9 +423,21 @@ if __name__ == '__main__':
if name == "lm_head.weight" and len(dest) == 0: if name == "lm_head.weight" and len(dest) == 0:
raise ValueError("lm_head is present in adapter, but is ignored in base model") raise ValueError("lm_head is present in adapter, but is ignored in base model")
for dest_name, dest_data in dest: for dest_name, dest_data in dest:
# mergekit-extract-lora add these layernorm to the adapter
if "_norm" in dest_name:
assert dest_data.dim() == 1
yield (dest_name, dest_data)
continue
# otherwise, we must get the lora_A and lora_B tensors
assert isinstance(dest_data, LoraTorchTensor) assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B() lora_a, lora_b = dest_data.get_lora_A_B()
# note: mergekit-extract-lora flip and transpose A and B
# here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
if "token_embd.weight" in dest_name:
lora_a = lora_a.T
yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b) yield (dest_name + ".lora_b", lora_b)

View file

@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf
Then, if you are not already in the repo directory, `cd` into `llama.cpp` and: Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
``` ```
$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}" $ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
``` ```
Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal. Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone: To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:

View file

@ -27,13 +27,6 @@ We recommend using openmp since it's easier to modify the cores being used.
### llama.cpp compilation ### llama.cpp compilation
Makefile:
```bash
make GGML_BLIS=1 -j
# make GGML_BLIS=1 llama-benchmark-matmult
```
CMake: CMake:
```bash ```bash

View file

@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
## News ## News
- 2024.11
- Support F16 and F32 data type model for Ascend 310P NPU.
- 2024.8 - 2024.8
- Support `Q4_0` and `Q8_0` data type for Ascend NPU. - Support `Q4_0` and `Q8_0` data type for Ascend NPU.
- 2024.7 - 2024.7
@ -40,9 +42,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
### Ascend NPU ### Ascend NPU
**Verified devices** **Verified devices**
| Ascend NPU | Status | | Ascend NPU | Status |
|:-----------------------------:|:-------:| |:-----------------------------:|:-------:|
| Atlas 300T A2 | Support | | Atlas 300T A2 | Support |
| Atlas 300I Duo | Support |
*Notes:* *Notes:*

View file

@ -34,13 +34,16 @@ The SYCL backend would be broken by some PRs due to no online CI.
The following release is verified with good quality: The following release is verified with good quality:
|Commit ID|Tag|Release|Verified Platform| |Commit ID|Tag|Release|Verified Platform| Update date|
|-|-|-|-| |-|-|-|-|-|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| |3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
## News ## News
- 2024.11
- Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer.
- 2024.8 - 2024.8
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs. - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
@ -310,12 +313,14 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with Nvidia BLAS acceleration through SYCL # Build LLAMA with Nvidia BLAS acceleration through SYCL
# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
# Option 1: Use FP32 (recommended for better performance in most cases) # Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16 # Option 2: Use FP16
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
# build all binary # build all binary
cmake --build build --config Release -j -v cmake --build build --config Release -j -v
@ -333,8 +338,9 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE
## AMD ## AMD
# Use FP32, FP16 is not supported # Use FP32, FP16 is not supported
# Find your GGML_SYCL_HIP_TARGET with rocminfo, under the key 'Name:' # Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:'
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_HIP_TARGET=${GGML_SYCL_HIP_TARGET} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# build all binary # build all binary
cmake --build build --config Release -j -v cmake --build build --config Release -j -v
@ -644,6 +650,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|--------------------|---------------------------------------|---------------------------------------------| |--------------------|---------------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model| | GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. | | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |

View file

@ -7,124 +7,75 @@ git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp cd llama.cpp
``` ```
In order to build llama.cpp you have four different options. The following sections describe how to build with different backends and options.
- Using `make`: ## CPU Build
- On Linux or MacOS:
```bash Build llama.cpp using `CMake`:
make
```
- On Windows (x86/x64 only, arm64 requires cmake): ```bash
cmake -B build
cmake --build build --config Release
```
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). **Notes**:
2. Extract `w64devkit` on your pc.
3. Run `w64devkit.exe`.
4. Use the `cd` command to reach the `llama.cpp` folder.
5. From here you can run:
```bash
make
```
- Notes: - For faster compilation, add the `-j` argument to run multiple jobs in parallel, or use a generator that does this automatically such as Ninja. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`. - For faster repeated compilation, install [ccache](https://ccache.dev/)
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel. - For debug builds, there are two cases:
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1`
- Using `CMake`: 1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash ```bash
cmake -B build cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
```
cmake -B build -DBUILD_SHARED_LIBS=OFF
cmake --build build --config Release cmake --build build --config Release
``` ```
**Notes**: - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
- Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`. - Tab Workload: Desktop-development with C++
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel. - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
- For faster repeated compilation, install [ccache](https://ccache.dev/). - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
- For debug builds, there are two cases: - For Windows on ARM (arm64, WoA) build with:
```bash
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag): cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```
Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.
For building with ninja generator and clang compiler as default:
-set path:set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64
```bash ```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug cmake --preset x64-windows-llvm-release
cmake --build build cmake --build build-x64-windows-llvm-release
``` ```
2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
- Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
- Tab Workload: Desktop-development with C++
- Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
- Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
- For Windows on ARM (arm64, WoA) build with:
```bash
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```
Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
2. Add your user to **video** group
3. Install compilation dependencies.
```bash
sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
## Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument.
## BLAS Build ## BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use: Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Using BLAS doesn't affect the generation performance. There are currently several different BLAS implementations available for build and use:
### Accelerate Framework: ### Accelerate Framework
This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions. This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
### OpenBLAS: ### OpenBLAS
This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine. This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
- Using `make`:
- On Linux:
```bash
make GGML_OPENBLAS=1
```
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
3. Extract `w64devkit` on your pc.
4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
6. Run `w64devkit.exe`.
7. Use the `cd` command to reach the `llama.cpp` folder.
8. From here you can run:
```bash
make GGML_OPENBLAS=1
```
- Using `CMake` on Linux: - Using `CMake` on Linux:
```bash ```bash
@ -136,14 +87,6 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i
Check [BLIS.md](./backend/BLIS.md) for more information. Check [BLIS.md](./backend/BLIS.md) for more information.
### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
### Intel oneMKL ### Intel oneMKL
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md). Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
@ -161,16 +104,31 @@ Building through oneAPI compilers will make avx_vnni instruction set available f
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information. Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
### CUDA ### Other BLAS libraries
This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Any other BLAS library can be used by setting the `GGML_BLAS_VENDOR` option. See the [CMake documentation](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) for a list of supported vendors.
For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling. ## Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `-DGGML_METAL=OFF` cmake option.
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers 0` command-line argument.
## SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
## CUDA
This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
If you are using Fedora (using Fedora Workstation, or an 'Atomic' variant such as Silverblue), or would like to set up CUDA in a toolbox, please consider our [Fedora CUDA guide](./cuda-fedora.md). Unfortunately, the process is not as simple as one might expect.
- Using `make`:
```bash
make GGML_CUDA=1
```
- Using `CMake`: - Using `CMake`:
```bash ```bash
@ -186,24 +144,16 @@ The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description | | Option | Legal values | Default | Description |
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. | | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models | | GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | | GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
### MUSA ## MUSA
This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa). This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
- Using `make`:
```bash
make GGML_MUSA=1
```
- Using `CMake`: - Using `CMake`:
```bash ```bash
@ -217,20 +167,16 @@ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enab
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet. Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
### hipBLAS ## HIP
This provides BLAS acceleration on HIP-supported AMD GPUs. This provides GPU acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed. Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
- Using `make`:
```bash
make GGML_HIPBLAS=1
```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash ```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16 && cmake --build build --config Release -- -j 16
``` ```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`. On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
@ -247,19 +193,14 @@ You can download it from your Linux distro's package manager or from here: [ROCm
```bash ```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \ HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \ HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16 && cmake --build build -- -j 16
``` ```
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash ```bash
set PATH=%HIP_PATH%\bin;%PATH% set PATH=%HIP_PATH%\bin;%PATH%
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake --build build cmake --build build
``` ```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@ -268,23 +209,16 @@ You can download it from your Linux distro's package manager or from here: [ROCm
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used. The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description | ## Vulkan
|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
### Vulkan
**Windows** **Windows**
#### w64devkit ### w64devkit
Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases). Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required. Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies: Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
```sh ```sh
@ -300,18 +234,47 @@ Libs: -lvulkan-1
EOF EOF
``` ```
Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
#### MSYS2 Switch into the `llama.cpp` directory and build using CMake.
```sh
cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
```
### Git Bash MINGW64
Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
Download and install [`Visual Studio Community Edition`](https://visualstudio.microsoft.com/) and make sure you select `C++`
Download and install [`CMake`](https://cmake.org/download/) with the default settings
Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
Go into your `llama.cpp` directory and right click, select `Open Git Bash Here` and then run the following commands
```
cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
```
Now you can load the model in conversation mode using `Vulkan`
```sh
build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
```
### MSYS2
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies. Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
```sh ```sh
pacman -S git \ pacman -S git \
mingw-w64-ucrt-x86_64-gcc \ mingw-w64-ucrt-x86_64-gcc \
mingw-w64-ucrt-x86_64-cmake \ mingw-w64-ucrt-x86_64-cmake \
mingw-w64-ucrt-x86_64-vulkan-devel \ mingw-w64-ucrt-x86_64-vulkan-devel \
mingw-w64-ucrt-x86_64-shaderc mingw-w64-ucrt-x86_64-shaderc
``` ```
Switch into `llama.cpp` directory and build using CMake.
Switch into the `llama.cpp` directory and build using CMake.
```sh ```sh
cmake -B build -DGGML_VULKAN=ON cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release cmake --build build --config Release
@ -360,7 +323,7 @@ cmake --build build --config Release
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
``` ```
### CANN ## CANN
This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU. This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU.
For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/). For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/).
@ -375,22 +338,26 @@ cmake --build build --config release
You can test with: You can test with:
`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
```bash ```bash
llm_load_tensors: CANN buffer size = 13313.00 MiB ./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32
```
If the following info is output on screen, you are using `llama.cpp` with the CANN backend:
```bash
llm_load_tensors: CANN model buffer size = 13313.00 MiB
llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
``` ```
For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md). For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
### Android ## Android
To read documentation for how to build on Android, [click here](./android.md) To read documentation for how to build on Android, [click here](./android.md)
### Arm CPU optimized mulmat kernels ## Notes about GPU-accelerated backends
Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`). In most cases, it is possible to build and use multiple backends at the same time. For example, you can build llama.cpp with both CUDA and Vulkan support by using the `-DGGML_CUDA=ON -DGGML_VULKAN=ON` options with CMake. At runtime, you can specify which backend devices to use with the `--device` option. To see a list of available devices, use the `--list-devices` option.
Backends can be built as dynamic libraries that can be loaded dynamically at runtime. This allows you to use the same llama.cpp binary on different machines with different GPUs. To enable this feature, use the `GGML_BACKEND_DL` option when building.

317
docs/cuda-fedora.md Normal file
View file

@ -0,0 +1,317 @@
# Setting Up CUDA on Fedora
In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox container. This guide is applicable for:
- [Fedora Workstation](https://fedoraproject.org/workstation/)
- [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/)
- [Fedora Spins](https://fedoraproject.org/spins)
- [Other Distributions](https://containertoolbx.org/distros/), including `Red Hat Enterprise Linux >= 8.`, `Arch Linux`, and `Ubuntu`.
## Table of Contents
- [Prerequisites](#prerequisites)
- [Monitoring NVIDIA CUDA Repositories](#monitoring-nvidia-cuda-repositories)
- [Using the Fedora 39 CUDA Repository](#using-the-fedora-39-cuda-repository)
- [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
- [Installing Essential Development Tools](#installing-essential-development-tools)
- [Adding the CUDA Repository](#adding-the-cuda-repository)
- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs)
- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts)
- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs)
- [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package)
- [Configuring the Environment](#configuring-the-environment)
- [Verifying the Installation](#verifying-the-installation)
- [Conclusion](#conclusion)
- [Troubleshooting](#troubleshooting)
- [Additional Notes](#additional-notes)
- [References](#references)
## Prerequisites
- **Toolbox Installed on the Host System** `Fedora Silverblue` and `Fedora Workstation` both have toolbox by default, other distributions may need to install the [toolbox package](https://containertoolbx.org/install/).
- **NVIDIA Drivers and Graphics Card installed on Host System (optional)** To run CUDA program, such as `llama.cpp`, the host should be setup to access your NVIDIA hardware. Fedora Hosts can use the [RPM Fusion Repository](https://rpmfusion.org/Howto/NVIDIA).
- **Internet connectivity** to download packages.
### Monitoring NVIDIA CUDA Repositories
Before proceeding, it is advisable to check if NVIDIA has updated their CUDA repositories for your Fedora version. NVIDIA's repositories can be found at:
- [Fedora 40 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora40/x86_64/)
- [Fedora 41 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/)
As of the latest update, these repositories do not contain the `cuda` meta-package or are missing essential components.
### Using the Fedora 39 CUDA Repository
Since the newer repositories are incomplete, we'll use the Fedora 39 repository:
- [Fedora 39 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64/)
**Note:** Fedora 39 is no longer maintained, so we recommend using a toolbox environment to prevent system conflicts.
## Creating a Fedora Toolbox Environment
This guide focuses on Fedora hosts, but with small adjustments, it can work for other hosts. Using a Fedora 39 toolbox allows us to install the necessary packages without affecting the host system.
**Note:** Toolbox is available for other systems, and even without Toolbox, it is possible to use Podman or Docker.
We do not recommend installing on the host system, as Fedora 39 is out-of-maintenance, and instead you should upgrade to a maintained version of Fedora for your host.
1. **Create a Fedora 39 Toolbox:**
```bash
toolbox create --image registry.fedoraproject.org/fedora-toolbox:39 --container fedora-toolbox-39-cuda
```
2. **Enter the Toolbox:**
```bash
toolbox enter --container fedora-toolbox-39-cuda
```
Inside the toolbox, you have root privileges and can install packages without affecting the host system.
## Installing Essential Development Tools
1. **Synchronize the DNF Package Manager:**
```bash
sudo dnf distro-sync
```
2. **Install the Default Text Editor (Optional):**
```bash
sudo dnf install vim-default-editor --allowerasing
```
The `--allowerasing` flag resolves any package conflicts.
3. **Install Development Tools and Libraries:**
```bash
sudo dnf install @c-development @development-tools cmake
```
This installs essential packages for compiling software, including `gcc`, `make`, and other development headers.
## Adding the CUDA Repository
Add the NVIDIA CUDA repository to your DNF configuration:
```bash
sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64/cuda-fedora39.repo
```
After adding the repository, synchronize the package manager again:
```bash
sudo dnf distro-sync
```
## Installing `nvidia-driver-libs`
Attempt to install `nvidia-driver-libs`:
```bash
sudo dnf install nvidia-driver-libs
```
**Explanation:**
- `nvidia-driver-libs` contains necessary NVIDIA driver libraries required by CUDA.
- This step might fail due to conflicts with existing NVIDIA drivers on the host system.
## Manually Resolving Package Conflicts
If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
### 1. Download the `nvidia-driver-libs` RPM
```bash
sudo dnf download --arch x86_64 nvidia-driver-libs
```
You should see a file similar to:
```
nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm
```
### 2. Attempt to Install the RPM
```bash
sudo dnf install nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm
```
**Expected Error:**
Installation may fail with errors pointing to conflicts with `egl-gbm` and `egl-wayland`.
**Note: It is important to carefully read the error messages to identify the exact paths that need to be excluded.**
### 3. Download Dependencies
```bash
sudo dnf download --arch x86_64 egl-gbm egl-wayland
```
### 4. Install `egl-gbm` with Excluded Paths
Exclude conflicting files during installation:
```bash
sudo rpm --install --verbose --hash \
--excludepath=/usr/lib64/libnvidia-egl-gbm.so.1.1.2 \
--excludepath=/usr/share/egl/egl_external_platform.d/15_nvidia_gbm.json \
egl-gbm-1.1.2^20240919gitb24587d-3.fc39.x86_64.rpm
```
**Explanation:**
- The `--excludepath` option skips installing files that conflict with existing files.
- Adjust the paths based on the error messages you receive.
### 5. Install `egl-wayland` with Excluded Paths
```bash
sudo rpm --install --verbose --hash \
--excludepath=/usr/share/egl/egl_external_platform.d/10_nvidia_wayland.json \
egl-wayland-1.1.17^20241118giteeb29e1-5.fc39.x86_64.rpm
```
### 6. Install `nvidia-driver-libs` with Excluded Paths
```bash
sudo rpm --install --verbose --hash \
--excludepath=/usr/share/glvnd/egl_vendor.d/10_nvidia.json \
--excludepath=/usr/share/nvidia/nvoptix.bin \
nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm
```
**Note:**
- Replace the paths with the ones causing conflicts in your installation if they differ.
- The `--verbose` and `--hash` options provide detailed output during installation.
## Finalizing the Installation of `nvidia-driver-libs`
After manually installing the dependencies, run:
```bash
sudo dnf install nvidia-driver-libs
```
You should receive a message indicating the package is already installed:
```
Package nvidia-driver-libs-3:560.35.05-1.fc39.x86_64 is already installed.
Dependencies resolved.
Nothing to do.
Complete!
```
## Installing the CUDA Meta-Package
Now that the driver libraries are installed, proceed to install CUDA:
```bash
sudo dnf install cuda
```
This installs the CUDA toolkit and associated packages.
## Configuring the Environment
To use CUDA, add its binary directory to your system's `PATH`.
1. **Create a Profile Script:**
```bash
sudo sh -c 'echo "export PATH=\$PATH:/usr/local/cuda/bin" >> /etc/profile.d/cuda.sh'
```
**Explanation:**
- We add to `/etc/profile.d/` as the `/etc/` folder is unique to this particular container, and is not shared with other containers or the host system.
- The backslash `\` before `$PATH` ensures the variable is correctly written into the script.
2. **Make the Script Executable:**
```bash
sudo chmod +x /etc/profile.d/cuda.sh
```
3. **Source the Script to Update Your Environment:**
```bash
source /etc/profile.d/cuda.sh
```
**Note:** This command updates your current shell session with the new `PATH`. The `/etc/profile.d/cuda.sh` script ensures that the CUDA binaries are available in your `PATH` for all future sessions.
## Verifying the Installation
To confirm that CUDA is correctly installed and configured, check the version of the NVIDIA CUDA Compiler (`nvcc`):
```bash
nvcc --version
```
You should see output similar to:
```
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Tue_Oct_29_23:50:19_PDT_2024
Cuda compilation tools, release 12.6, V12.6.85
Build cuda_12.6.r12.6/compiler.35059454_0
```
This output confirms that the CUDA compiler is accessible and indicates the installed version.
## Conclusion
You have successfully set up CUDA on Fedora within a toolbox environment using the Fedora 39 CUDA repository. By manually resolving package conflicts and configuring the environment, you can develop CUDA applications without affecting your host system.
## Troubleshooting
- **Installation Failures:**
- If you encounter errors during installation, carefully read the error messages. They often indicate conflicting files or missing dependencies.
- Use the `--excludepath` option with `rpm` to exclude conflicting files during manual installations.
- **Driver Conflicts:**
- Since the host system may already have NVIDIA drivers installed, conflicts can arise. Using the toolbox environment helps isolate these issues.
- **Environment Variables Not Set:**
- If `nvcc` is not found after installation, ensure that `/usr/local/cuda/bin` is in your `PATH`.
- Run `echo $PATH` to check if the path is included.
- Re-source the profile script or open a new terminal session.
## Additional Notes
- **Updating CUDA in the Future:**
- Keep an eye on the official NVIDIA repositories for updates to your Fedora version.
- When an updated repository becomes available, adjust your `dnf` configuration accordingly.
- **Building `llama.cpp`:**
- With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support.
- Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration.
- **Using the Toolbox Environment:**
- The toolbox environment is isolated from your host system, which helps prevent conflicts.
- Remember that system files and configurations inside the toolbox are separate from the host. By default the home directory of the user is shared between the host and the toolbox.
---
**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with he toolbox.
**Acknowledgments:** Special thanks to the Fedora community and NVIDIA documentation for providing resources that assisted in creating this guide.
## References
- [Fedora Toolbox Documentation](https://docs.fedoraproject.org/en-US/fedora-silverblue/toolbox/)
- [NVIDIA CUDA Installation Guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
- [Podman Documentation](https://podman.io/get-started)
---

View file

@ -28,7 +28,7 @@ The required steps to implement for an HF model are:
```python ```python
@Model.register("MyModelForCausalLM") @Model.register("MyModelForCausalLM")
class MyModel(Model): class MyModel(Model):
model_arch = gguf.MODEL_ARCH.GROK model_arch = gguf.MODEL_ARCH.MYMODEL
``` ```
2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py) 2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
@ -79,14 +79,14 @@ Depending on the model configuration, tokenizer, code and tensors layout, you wi
- `Model#set_vocab` - `Model#set_vocab`
- `Model#write_tensors` - `Model#write_tensors`
NOTE: Tensor names must end with `.weight` suffix, that is the convention and several tools like `quantize` expect this to proceed the weights. NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the convention and several tools like `quantize` expect this to proceed the weights.
### 2. Define the model architecture in `llama.cpp` ### 2. Define the model architecture in `llama.cpp`
The model params and tensors layout must be defined in `llama.cpp`: The model params and tensors layout must be defined in `llama.cpp`:
1. Define a new `llm_arch` 1. Define a new `llm_arch`
2. Define the tensors layout in `LLM_TENSOR_NAMES` 2. Define the tensors layout in `LLM_TENSOR_NAMES`
3. Add any non standard metadata in `llm_load_hparams` 3. Add any non-standard metadata in `llm_load_hparams`
4. Create the tensors for inference in `llm_load_tensors` 4. Create the tensors for inference in `llm_load_tensors`
5. If the model has a RoPE operation, add the rope type in `llama_rope_type` 5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
@ -96,9 +96,9 @@ NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorc
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`. This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`. Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR. Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.
Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/). Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).

View file

@ -6,20 +6,26 @@ find_package(Threads REQUIRED)
# ... # ...
# flags
llama_add_compile_flags()
# examples # examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN) if (EMSCRIPTEN)
else() else()
add_subdirectory(cvector-generator)
add_subdirectory(batched-bench) add_subdirectory(batched-bench)
add_subdirectory(batched) add_subdirectory(batched)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding) add_subdirectory(embedding)
add_subdirectory(eval-callback) add_subdirectory(eval-callback)
add_subdirectory(export-lora)
add_subdirectory(gbnf-validator) if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(gbnf-validator)
endif()
add_subdirectory(gguf-hash) add_subdirectory(gguf-hash)
add_subdirectory(gguf-split) add_subdirectory(gguf-split)
add_subdirectory(gguf) add_subdirectory(gguf)
@ -27,28 +33,41 @@ else()
add_subdirectory(imatrix) add_subdirectory(imatrix)
add_subdirectory(infill) add_subdirectory(infill)
add_subdirectory(llama-bench) add_subdirectory(llama-bench)
add_subdirectory(llava)
add_subdirectory(lookahead) add_subdirectory(lookahead)
add_subdirectory(lookup) add_subdirectory(lookup)
add_subdirectory(main) add_subdirectory(main)
add_subdirectory(parallel) add_subdirectory(parallel)
add_subdirectory(passkey) add_subdirectory(passkey)
add_subdirectory(perplexity) add_subdirectory(perplexity)
add_subdirectory(quantize-stats)
add_subdirectory(quantize) add_subdirectory(quantize)
add_subdirectory(retrieval) add_subdirectory(retrieval)
if (GGML_RPC)
add_subdirectory(rpc)
endif()
if (LLAMA_BUILD_SERVER) if (LLAMA_BUILD_SERVER)
add_subdirectory(server) add_subdirectory(server)
endif()
if (GGML_SYCL)
add_subdirectory(sycl)
endif() endif()
add_subdirectory(save-load-state) add_subdirectory(save-load-state)
add_subdirectory(run)
add_subdirectory(simple) add_subdirectory(simple)
add_subdirectory(simple-chat) add_subdirectory(simple-chat)
add_subdirectory(speculative) add_subdirectory(speculative)
add_subdirectory(speculative-simple)
add_subdirectory(tokenize) add_subdirectory(tokenize)
add_subdirectory(tts)
add_subdirectory(gen-docs)
if (NOT GGML_BACKEND_DL)
# these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(cvector-generator)
add_subdirectory(export-lora)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(quantize-stats)
endif()
add_subdirectory(llava)
if (GGML_RPC)
add_subdirectory(rpc)
endif()
if (GGML_SYCL)
add_subdirectory(sycl)
endif()
endif()
endif() endif()

View file

@ -1,61 +0,0 @@
#!/bin/bash
#
# Few-shot translation example.
# Requires a base model (i.e. no fine-tuned or instruct models).
#
# Usage:
#
# cd llama.cpp
# make -j
#
# ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
#
if [ $# -lt 2 ]; then
echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
exit 1
fi
eargs=""
if [ $# -gt 2 ]; then
eargs="${@:3}"
fi
ftmp="__llama.cpp_example_tmp__.txt"
trap "rm -f $ftmp" EXIT
echo "Translate from English to French:
===
sea otter, peppermint, plush girafe:
sea otter => loutre de mer
peppermint => menthe poivrée
plush girafe => girafe peluche
===
violin
violin => violon
===
phone, computer, mouse, keyboard:
phone => téléphone
computer => ordinateur
mouse => souris
keyboard => clavier
===
" > $ftmp
echo "$2
" >> $ftmp
model=$1
# generate the most likely continuation until the string "===" is found
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs

View file

@ -2,4 +2,4 @@ set(TARGET llama-batched-bench)
add_executable(${TARGET} batched-bench.cpp) add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View file

@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = common_model_params_to_llama(params); llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__); fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@ -50,7 +50,7 @@ int main(int argc, char ** argv) {
// ensure enough sequences are available // ensure enough sequences are available
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end()); ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_init_from_model(model, ctx_params);
if (ctx == NULL) { if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
llama_batch_free(batch); llama_batch_free(batch);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_model_free(model);
llama_backend_free(); llama_backend_free();

View file

@ -23,12 +23,12 @@ defer {
} }
let model_params = llama_model_default_params() let model_params = llama_model_default_params()
guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else { guard let model = llama_model_load_from_file(modelPath.cString(using: .utf8), model_params) else {
print("Failed to load model") print("Failed to load model")
exit(1) exit(1)
} }
defer { defer {
llama_free_model(model) llama_model_free(model)
} }
var tokens = tokenize(text: prompt, add_bos: true) var tokens = tokenize(text: prompt, add_bos: true)
@ -141,7 +141,7 @@ while n_cur <= n_len {
let new_token_id = llama_sampler_sample(smpl, context, i_batch[i]) let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
// is it an end of stream? -> mark the stream as finished // is it an end of stream? -> mark the stream as finished
if llama_token_is_eog(model, new_token_id) || n_cur == n_len { if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len {
i_batch[i] = -1 i_batch[i] = -1
// print("") // print("")
if n_parallel > 1 { if n_parallel > 1 {

View file

@ -2,4 +2,4 @@ set(TARGET llama-batched)
add_executable(${TARGET} batched.cpp) add_executable(${TARGET} batched.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View file

@ -41,17 +41,19 @@ int main(int argc, char ** argv) {
llama_model_params model_params = common_model_params_to_llama(params); llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
LOG_ERR("%s: error: unable to load model\n" , __func__); LOG_ERR("%s: error: unable to load model\n" , __func__);
return 1; return 1;
} }
const llama_vocab * vocab = llama_model_get_vocab(model);
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> tokens_list; std::vector<llama_token> tokens_list;
tokens_list = common_tokenize(model, params.prompt, true); tokens_list = common_tokenize(vocab, params.prompt, true);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel; const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
@ -62,16 +64,17 @@ int main(int argc, char ** argv) {
ctx_params.n_ctx = n_kv_req; ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_predict, n_parallel); ctx_params.n_batch = std::max(n_predict, n_parallel);
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_init_from_model(model, ctx_params);
auto sparams = llama_sampler_chain_default_params(); auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false;
llama_sampler * smpl = llama_sampler_chain_init(sparams); llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k)); llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep)); llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep));
llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp)); llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed)); llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));
if (ctx == NULL) { if (ctx == NULL) {
LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
@ -119,8 +122,8 @@ int main(int argc, char ** argv) {
} }
llama_token decoder_start_token_id = llama_model_decoder_start_token(model); llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) { if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
decoder_start_token_id = llama_token_bos(model); decoder_start_token_id = llama_vocab_bos(vocab);
} }
common_batch_clear(batch); common_batch_clear(batch);
@ -173,7 +176,7 @@ int main(int argc, char ** argv) {
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]); const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
// is it an end of generation? -> mark the stream as finished // is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1; i_batch[i] = -1;
LOG("\n"); LOG("\n");
if (n_parallel > 1) { if (n_parallel > 1) {
@ -235,7 +238,7 @@ int main(int argc, char ** argv) {
llama_sampler_free(smpl); llama_sampler_free(smpl);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_model_free(model);
llama_backend_free(); llama_backend_free();

View file

@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt" NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin" NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+' SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+' '|'\
'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d" SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
CTX_SIZE=2048 CTX_SIZE=2048
@ -129,15 +130,12 @@ while read -e line; do
printf ' ' printf ' '
# HACK get num tokens from debug message if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
# TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
echo >&2 "Couldn't get number of tokens from ./llama-cli output!" echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
exit 1 exit 1
fi fi
n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg"))) n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")
if ((n_tokens > CTX_ROTATE_POINT)); then if ((n_tokens > CTX_ROTATE_POINT)); then
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE" tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"

View file

@ -2,4 +2,4 @@ set(TARGET llama-convert-llama2c-to-ggml)
add_executable(${TARGET} convert-llama2c-to-ggml.cpp) add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View file

@ -2,11 +2,8 @@
This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default. This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository: To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository.
`$ make -j`
After successful compilation, following usage options are available:
``` ```
usage: ./llama-convert-llama2c-to-ggml [options] usage: ./llama-convert-llama2c-to-ggml [options]

View file

@ -1,4 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "gguf.h"
#include "llama.h" #include "llama.h"
#include "common.h" #include "common.h"
#include "log.h" #include "log.h"
@ -434,12 +436,12 @@ static void print_matrix(struct ggml_tensor * probs) {
} }
} }
struct llama_file { struct my_llama_file {
// use FILE * so we don't have to re-open the file to mmap // use FILE * so we don't have to re-open the file to mmap
FILE * fp; FILE * fp;
size_t size; size_t size;
llama_file(const char * fname, const char * mode) { my_llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode); fp = std::fopen(fname, mode);
if (fp == NULL) { if (fp == NULL) {
size = 0; size = 0;
@ -500,7 +502,7 @@ struct llama_file {
return std::string(chars.data(), len); return std::string(chars.data(), len);
} }
~llama_file() { ~my_llama_file() {
if (fp) { if (fp) {
std::fclose(fp); std::fclose(fp);
} }
@ -508,7 +510,7 @@ struct llama_file {
}; };
static bool is_ggml_file(const char * filename) { static bool is_ggml_file(const char * filename) {
llama_file file(filename, "rb"); my_llama_file file(filename, "rb");
if (file.size < 4) { if (file.size < 4) {
return false; return false;
} }
@ -576,7 +578,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
} else { } else {
// assume llama2.c vocabulary // assume llama2.c vocabulary
LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
llama_file file(filename, "rb"); my_llama_file file(filename, "rb");
if (!file.fp) { if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename); die_fmt("%s: %s", strerror(errno), filename);
} }
@ -689,8 +691,8 @@ static void save_as_llama_model(
gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1); gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL);
gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1); gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL);
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx); gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
@ -909,7 +911,7 @@ int main(int argc, char ** argv) {
load_vocab(params.fn_vocab_model, &config, &vocab); load_vocab(params.fn_vocab_model, &config, &vocab);
struct my_llama_model model; struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); model.hparams.n_vocab = config.vocab_size; //llama_vocab_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx; model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd; model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim; model.hparams.n_ff = config.hidden_dim;

View file

@ -840,6 +840,8 @@ class OutputFile:
self.gguf.add_base_model_version(key, base_model_entry["version"]) self.gguf.add_base_model_version(key, base_model_entry["version"])
if "organization" in base_model_entry: if "organization" in base_model_entry:
self.gguf.add_base_model_organization(key, base_model_entry["organization"]) self.gguf.add_base_model_organization(key, base_model_entry["organization"])
if "description" in base_model_entry:
self.gguf.add_base_model_description(key, base_model_entry["description"])
if "url" in base_model_entry: if "url" in base_model_entry:
self.gguf.add_base_model_url(key, base_model_entry["url"]) self.gguf.add_base_model_url(key, base_model_entry["url"])
if "doi" in base_model_entry: if "doi" in base_model_entry:
@ -849,12 +851,32 @@ class OutputFile:
if "repo_url" in base_model_entry: if "repo_url" in base_model_entry:
self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"]) self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
if metadata.datasets is not None:
self.gguf.add_dataset_count(len(metadata.datasets))
for key, dataset_entry in enumerate(metadata.datasets):
if "name" in dataset_entry:
self.gguf.add_dataset_name(key, dataset_entry["name"])
if "author" in dataset_entry:
self.gguf.add_dataset_author(key, dataset_entry["author"])
if "version" in dataset_entry:
self.gguf.add_dataset_version(key, dataset_entry["version"])
if "organization" in dataset_entry:
self.gguf.add_dataset_organization(key, dataset_entry["organization"])
if "description" in dataset_entry:
self.gguf.add_dataset_description(key, dataset_entry["description"])
if "url" in dataset_entry:
self.gguf.add_dataset_url(key, dataset_entry["url"])
if "doi" in dataset_entry:
self.gguf.add_dataset_doi(key, dataset_entry["doi"])
if "uuid" in dataset_entry:
self.gguf.add_dataset_uuid(key, dataset_entry["uuid"])
if "repo_url" in dataset_entry:
self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"])
if metadata.tags is not None: if metadata.tags is not None:
self.gguf.add_tags(metadata.tags) self.gguf.add_tags(metadata.tags)
if metadata.languages is not None: if metadata.languages is not None:
self.gguf.add_languages(metadata.languages) self.gguf.add_languages(metadata.languages)
if metadata.datasets is not None:
self.gguf.add_datasets(metadata.datasets)
def add_meta_arch(self, params: Params) -> None: def add_meta_arch(self, params: Params) -> None:
# Metadata About The Neural Architecture Itself # Metadata About The Neural Architecture Itself

View file

@ -2,4 +2,4 @@ set(TARGET llama-cvector-generator)
add_executable(${TARGET} cvector-generator.cpp pca.hpp) add_executable(${TARGET} cvector-generator.cpp pca.hpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_17)

Some files were not shown because too many files have changed in this diff Show more