Compare commits


3 commits

Author           SHA1        Message                                             Date
Johannes Gäßler  a8046c888a  use calloc instead of malloc                        2024-12-04 17:24:35 +01:00
Johannes Gäßler  096b847a0f  fix wrong type in print                             2024-12-04 14:16:05 +01:00
Johannes Gäßler  b88727009d  GGUF: backend support, fixed-width I/O, misc fixes  2024-12-04 13:16:03 +01:00
513 changed files with 30539 additions and 79925 deletions

@@ -1,92 +0,0 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
ARG TARGETARCH
ARG GGML_CPU_ARM_ARCH=armv8-a
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "$TARGETARCH" = "amd64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
elif [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
else \
echo "Unsupported architecture"; \
exit 1; \
fi && \
cmake --build build -j $(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

@@ -1,94 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
WORKDIR /app
COPY . .
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

@@ -0,0 +1,33 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc) && \
cp build/bin/* .
ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -0,0 +1,33 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc) && \
cp build/bin/* .
ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -0,0 +1,50 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102"
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
RUN make -j$(nproc)
ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/full.Dockerfile (new file, 25 lines)

@@ -0,0 +1,25 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
ENV LLAMA_CURL=1
RUN make -j$(nproc)
ENV LC_ALL=C.utf8
ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,91 +0,0 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
## Build Image
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" \
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
### Full
FROM base AS full
COPY --from=build /app/lib/ /app
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

@@ -0,0 +1,38 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git cmake
WORKDIR /app
COPY . .
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/lib/ /
COPY --from=build /app/build/bin/llama-cli /
ENTRYPOINT [ "/llama-cli" ]

@@ -0,0 +1,28 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with static libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --config Release --target llama-cli
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]

@@ -0,0 +1,38 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the MUSA runtime image
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git cmake
WORKDIR /app
COPY . .
# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/lib/ /
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENTRYPOINT [ "/llama-cli" ]

@@ -0,0 +1,45 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102"
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make -j$(nproc) llama-cli
ENTRYPOINT [ "/app/llama-cli" ]

@@ -0,0 +1,27 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget libgomp1
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \
cmake --build build --config Release --target llama-cli
# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-cli /llama-cli && \
rm -rf /app
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]

@@ -0,0 +1,23 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
RUN make -j$(nproc) llama-cli
FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]

@@ -0,0 +1,43 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-server -j$(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/lib/ /
COPY --from=build /app/build/bin/llama-server /llama-server
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

@@ -0,0 +1,34 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-server
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev curl
COPY --from=build /app/build/bin/llama-server /llama-server
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

@@ -0,0 +1,43 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the MUSA runtime image
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-server -j$(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/lib/ /
COPY --from=build /app/build/bin/llama-server /llama-server
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

@@ -0,0 +1,54 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102"
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev curl
RUN make -j$(nproc) llama-server
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

@@ -0,0 +1,31 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build build --config Release --target llama-server
# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-server /llama-server && \
rm -rf /app
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

@@ -0,0 +1,41 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN \
# Build multiple versions of the CPU backend
scripts/build-cpu.sh avx -DGGML_AVX=ON -DGGML_AVX2=OFF && \
scripts/build-cpu.sh avx2 -DGGML_AVX=ON -DGGML_AVX2=ON && \
scripts/build-cpu.sh avx512 -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \
scripts/build-cpu.sh amx -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \
# Build llama-server
cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build --target llama-server -j $(nproc) && \
# Copy the built libraries to /app/lib
mkdir -p /app/lib && \
mv libggml-cpu* /app/lib/ && \
find build -name "*.so" -exec cp {} /app/lib/ \;
FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/build/bin/llama-server /llama-server
COPY --from=build /app/lib/ /
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

@@ -1,108 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y \
build-essential \
cmake \
python3 \
python3-pip \
git \
libcurl4-openssl-dev \
libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ${BASE_MUSA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

@@ -31,7 +31,6 @@
   # Increases the runtime closure size by ~700M
   useMpi ? false,
   useRocm ? config.rocmSupport,
-  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
   enableCurl ? true,
   useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@@ -189,7 +188,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
      ]
      ++ optionals useRocm [
        (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-        (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
+        (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
      ]
      ++ optionals useMetalKit [
        (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")

@@ -1,113 +0,0 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.3
ARG AMDGPU_VERSION=6.3
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
### Build image
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
# gfx906 is deprecated
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
ARG ROCM_DOCKER_ARCH=gfx1100
# Set nvcc architectured
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
# ENV CC=/opt/rocm/llvm/bin/clang
# ENV CXX=/opt/rocm/llvm/bin/clang++
RUN apt-get update \
&& apt-get install -y \
build-essential \
cmake \
git \
libcurl4-openssl-dev \
curl \
libgomp1
WORKDIR /app
COPY . .
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
&& cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib \
&& find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ${BASE_ROCM_DEV_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3-pip \
python3 \
python3-wheel\
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

@@ -8,36 +8,28 @@ arg1="$1"
shift

if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    exec python3 ./convert_hf_to_gguf.py "$@"
+    python3 ./convert_hf_to_gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    exec ./llama-quantize "$@"
+    ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    exec ./llama-cli "$@"
-elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
-    exec ./llama-bench "$@"
-elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
-    exec ./llama-perplexity "$@"
+    ./llama-cli "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
-    for i in $(ls $1/$2/ggml-model-f16.bin*); do
+    for i in `ls $1/$2/ggml-model-f16.bin*`; do
        if [ -f "${i/f16/q4_0}" ]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
        fi
    done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    exec ./llama-server "$@"
+    ./llama-server "$@"
else
    echo "Unknown command: $arg1"
    echo "Available commands: "
    echo "  --run (-r): Run a model previously converted into ggml"
    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
-    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
-    echo "      ex: -m model.gguf"
-    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
-    echo "      ex: -m model.gguf -f file.txt"
    echo "  --convert (-c): Convert a llama model into ggml"
    echo "      ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"

@@ -1,89 +0,0 @@
ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
apt update -y && \
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl libvulkan-dev \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

@@ -40,11 +40,3 @@ indent_style = tab
[examples/cvector-generator/*.txt]
trim_trailing_whitespace = unset
insert_final_newline = unset
-
-[models/templates/*.jinja]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
-trim_trailing_whitespace = unset
-insert_final_newline = unset

@@ -65,22 +65,12 @@ body:
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
-  - type: textarea
-    id: command
-    attributes:
-      label: Compile command
-      description: >
-        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
-        This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: true
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
-        Please copy and paste any relevant log output, including any generated text.
+        Please copy and paste any relevant log output, including the command that you entered and any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:

@@ -52,16 +52,6 @@ body:
        - Other (Please specify in the next section)
    validations:
      required: false
-  - type: textarea
-    id: command
-    attributes:
-      label: Command line
-      description: >
-        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
-        This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: false
  - type: textarea
    id: info
    attributes:
@@ -84,7 +74,7 @@ body:
    attributes:
      label: Relevant log output
      description: >
-        If applicable, please copy and paste any relevant log output, including any generated text.
+        If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:

@@ -10,10 +10,10 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -43,12 +43,6 @@
        with:
          fetch-depth: 0

-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-arm64
-          evict-old-files: 1d
-
      - name: Dependencies
        id: depends
        continue-on-error: true
@@ -59,14 +53,16 @@
        id: cmake_build
        run: |
          sysctl -a
-          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
+          mkdir build
+          cd build
+          cmake .. \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_CURL=ON \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+            -DGGML_RPC=ON \
+            -DBUILD_SHARED_LIBS=OFF
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: cmake_test
@@ -92,7 +88,6 @@
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp LICENSE ./build/bin/
-          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*

      - name: Upload artifacts
@@ -112,12 +107,6 @@
        with:
          fetch-depth: 0

-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-x64
-          evict-old-files: 1d
-
      - name: Dependencies
        id: depends
        continue-on-error: true
@@ -131,11 +120,11 @@
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_CURL=ON \
            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON
+            -DGGML_RPC=ON \
+            -DBUILD_SHARED_LIBS=OFF
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -162,7 +151,6 @@
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp LICENSE ./build/bin/
-          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*

      - name: Upload artifacts
@@ -172,8 +160,8 @@
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
          name: llama-bin-macos-x64.zip

-  ubuntu-cpu-cmake:
-    runs-on: ubuntu-22.04
+  ubuntu-latest-cmake:
+    runs-on: ubuntu-latest

    steps:
      - name: Clone
@@ -182,12 +170,6 @@
        with:
          fetch-depth: 0

-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-cpu-cmake
-          evict-old-files: 1d
-
      - name: Dependencies
        id: depends
        run: |
@@ -197,11 +179,10 @@
      - name: Build
        id: cmake_build
        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_CURL=ON \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(nproc)
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+          cmake --build . --config Release -j $(nproc)

      - name: Test
        id: cmake_test
@@ -238,7 +219,6 @@
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp LICENSE ./build/bin/
-          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*

      - name: Upload artifacts
@@ -256,19 +236,13 @@
    strategy:
      matrix:
        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug]
+        build_type: [Debug, Release]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
-          evict-old-files: 1d
-
      - name: Dependencies
        id: depends
        run: |
@@ -279,52 +253,19 @@
        id: cmake_build
        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

      - name: Build (no OpenMP)
        id: cmake_build_no_openmp
        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DGGML_OPENMP=OFF
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  ubuntu-latest-llguidance:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Build
-        id: cmake_build
        run: |
          mkdir build
          cd build
-          cmake .. \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_LLGUIDANCE=ON
-          cmake --build . --config Release -j $(nproc)
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
+          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

      - name: Test
        id: cmake_test
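
For reference, one cell of the sanitizer matrix on the right-hand side of this hunk can be reproduced locally; a sketch for ADDRESS in a Debug build, with flags taken from the workflow lines above:
# Local reproduction of one sanitizer-matrix cell (ADDRESS, Debug).
mkdir build && cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_ADDRESS=ON -DCMAKE_BUILD_TYPE=Debug
cmake --build . --config Debug -j "$(nproc)"
ctest -L main --verbose --timeout 900
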
@@ -342,12 +283,6 @@
        id: checkout
        uses: actions/checkout@v4

-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-latest-cmake-rpc
-          evict-old-files: 1d
-
      - name: Dependencies
        id: depends
        run: |
@@ -357,9 +292,10 @@
      - name: Build
        id: cmake_build
        run: |
-          cmake -B build \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(nproc)
+          mkdir build
+          cd build
+          cmake -DGGML_RPC=ON ..
+          cmake --build . --config Release -j $(nproc)

      - name: Test
        id: cmake_test
@@ -375,33 +311,21 @@
        id: checkout
        uses: actions/checkout@v4

-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-vulkan
-          evict-old-files: 1d
-
      - name: Dependencies
        id: depends
        run: |
          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
+          sudo apt-get install -y build-essential vulkan-sdk

      - name: Build
        id: cmake_build
        run: |
-          cmake -B build \
-            -DGGML_VULKAN=ON
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
+          mkdir build
          cd build
-          # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 1800
+          cmake -DGGML_VULKAN=ON ..
+          cmake --build . --config Release -j $(nproc)

  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
@@ -418,27 +342,16 @@
          sudo apt-get update
          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev

-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-hip
-          evict-old-files: 1d
-
      - name: Build with native CMake HIP support
        id: cmake_build
        run: |
-          cmake -B build -S . \
-            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DGGML_HIP=ON
+          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
          cmake --build build --config Release -j $(nproc)

      - name: Build with legacy HIP support
        id: cmake_build_legacy_hip
        run: |
-          cmake -B build2 -S . \
-            -DCMAKE_C_COMPILER=hipcc \
-            -DCMAKE_CXX_COMPILER=hipcc \
-            -DGGML_HIP=ON
+          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
          cmake --build build2 --config Release -j $(nproc)

  ubuntu-22-cmake-musa:
@@ -456,17 +369,10 @@
          apt-get update
          apt-get install -y build-essential git cmake libcurl4-openssl-dev

-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-musa
-          evict-old-files: 1d
-
      - name: Build with native CMake MUSA support
        id: cmake_build
        run: |
-          cmake -B build -S . \
-            -DGGML_MUSA=ON
+          cmake -B build -S . -DGGML_MUSA=ON
          cmake --build build --config Release -j $(nproc)

  ubuntu-22-cmake-sycl:
@@ -501,21 +407,14 @@
        id: checkout
        uses: actions/checkout@v4

-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-sycl
-          evict-old-files: 1d
-
      - name: Build
        id: cmake_build
        run: |
          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx
-          cmake --build build --config Release -j $(nproc)
+          mkdir build
+          cd build
+          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+          cmake --build . --config Release -j $(nproc)

  ubuntu-22-cmake-sycl-fp16:
    runs-on: ubuntu-22.04
@@ -549,22 +448,47 @@
        id: checkout
        uses: actions/checkout@v4

-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-sycl-fp16
-          evict-old-files: 1d
-
      - name: Build
        id: cmake_build
        run: |
          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DGGML_SYCL_F16=ON
-          cmake --build build --config Release -j $(nproc)
+          mkdir build
+          cd build
+          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
+          cmake --build . --config Release -j $(nproc)
+
+  # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+  #       how to debug it.
+  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
+  #       would be great if we fix these
+  macOS-latest-cmake:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          mkdir build
+          cd build
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900

  macOS-latest-cmake-ios:
    runs-on: macos-latest
@@ -574,12 +498,6 @@
        id: checkout
        uses: actions/checkout@v4

-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-ios
-          evict-old-files: 1d
-
      - name: Dependencies
        id: depends
        continue-on-error: true
@@ -590,7 +508,9 @@
        id: cmake_build
        run: |
          sysctl -a
-          cmake -B build -G Xcode \
+          mkdir build
+          cd build
+          cmake -G Xcode .. \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
@@ -599,7 +519,7 @@
            -DCMAKE_SYSTEM_NAME=iOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

  macOS-latest-cmake-tvos:
    runs-on: macos-latest
@@ -609,12 +529,6 @@
        id: checkout
        uses: actions/checkout@v4

-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-tvos
-          evict-old-files: 1d
-
      - name: Dependencies
        id: depends
        continue-on-error: true
@@ -625,7 +539,9 @@
        id: cmake_build
        run: |
          sysctl -a
-          cmake -B build -G Xcode \
+          mkdir build
+          cd build
+          cmake -G Xcode .. \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
@@ -634,50 +550,37 @@
            -DCMAKE_SYSTEM_NAME=tvOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-  macOS-latest-swift:
-    runs-on: macos-latest
-
-    strategy:
-      matrix:
-        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-swift
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build llama.cpp with CMake
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-          sudo cmake --install build --config Release
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}"
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+
+# TODO: tmp disabled. see for possible re-enable:
+#       https://github.com/ggerganov/llama.cpp/pull/10525
+#  macOS-latest-swift:
+#    runs-on: macos-latest
+#
+#    strategy:
+#      matrix:
+#        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v4
+#
+#      - name: Dependencies
+#        id: depends
+#        continue-on-error: true
+#        run: |
+#          brew update
+#
+#      - name: xcodebuild for swift package
+#        id: xcodebuild
+#        run: |
+#          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
+#
+#      - name: Build Swift Example
+#        id: make_build_swift_example
+#        run: |
+#          make swift

  windows-msys2:
    runs-on: windows-latest
@@ -693,13 +596,6 @@
      - name: Clone
        uses: actions/checkout@v4

-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: windows-msys2
-          variant: sccache
-          evict-old-files: 1d
-
      - name: Setup ${{ matrix.sys }}
        uses: msys2/setup-msys2@v2
        with:
@@ -707,7 +603,6 @@
          msystem: ${{matrix.sys}}
          install: >-
            base-devel
-            git
            mingw-w64-${{matrix.env}}-toolchain
            mingw-w64-${{matrix.env}}-cmake
            mingw-w64-${{matrix.env}}-openblas
@@ -741,25 +636,23 @@
      matrix:
        include:
          - build: 'noavx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx2-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'avx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx512-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'openblas-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'llvm-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'msvc-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
-          - build: 'llvm-arm64-opencl-adreno'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'

    steps:
      - name: Clone
@ -768,13 +661,6 @@ jobs:
with: with:
fetch-depth: 0 fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-${{ matrix.build }}
variant: sccache
evict-old-files: 1d
- name: Clone Kompute submodule - name: Clone Kompute submodule
id: clone_kompute id: clone_kompute
if: ${{ matrix.build == 'kompute-x64' }} if: ${{ matrix.build == 'kompute-x64' }}
@ -808,26 +694,6 @@ jobs:
run: | run: |
choco install ninja choco install ninja
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
run: |
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
cmake -B build `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build --target install
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
cmake -B build-arm64-release `
-A arm64 `
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build-arm64-release --target install --config release
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
@ -857,7 +723,7 @@ jobs:
- name: Test - name: Test
id: cmake_test id: cmake_test
# not all machines have native AVX-512 # not all machines have native AVX-512
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: | run: |
cd build cd build
ctest -L main -C Release --verbose --timeout 900 ctest -L main -C Release --verbose --timeout 900
@ -893,7 +759,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: | run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts - name: Upload artifacts
@ -911,8 +776,6 @@ jobs:
- name: Clone - name: Clone
id: checkout id: checkout
uses: actions/checkout@v4 uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies - name: Install dependencies
env: env:
@ -921,21 +784,9 @@ jobs:
apt update apt update
apt install -y cmake build-essential ninja-build libgomp1 git apt install -y cmake build-essential ninja-build libgomp1 git
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-latest-cmake-cuda
evict-old-files: 1d
- name: Build with CMake - name: Build with CMake
run: | run: |
cmake -S . -B build -G Ninja \ cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CUDA_ARCHITECTURES=89-real \
-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_NATIVE=OFF \
-DGGML_CUDA=ON
cmake --build build cmake --build build
windows-2019-cmake-cuda: windows-2019-cmake-cuda:
@ -953,13 +804,6 @@ jobs:
with: with:
fetch-depth: 0 fetch-depth: 0
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
variant: sccache
evict-old-files: 1d
- name: Install Cuda Toolkit 11.7 - name: Install Cuda Toolkit 11.7
if: ${{ matrix.cuda == '11.7' }} if: ${{ matrix.cuda == '11.7' }}
run: | run: |
@ -1016,6 +860,11 @@ jobs:
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
- name: Install Ninja - name: Install Ninja
id: install_ninja id: install_ninja
run: | run: |
@ -1026,11 +875,7 @@ jobs:
shell: cmd shell: cmd
run: | run: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" ^ cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
-DLLAMA_BUILD_SERVER=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_CUDA=ON ^
-DGGML_RPC=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release cmake --build build --config Release
@ -1095,13 +940,6 @@ jobs:
with: with:
fetch-depth: 0 fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-sycl
variant: sccache
evict-old-files: 1d
- name: Install - name: Install
run: | run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
@ -1181,22 +1019,16 @@ jobs:
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: Install ccache - name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2.16 uses: hendrikmuhs/ccache-action@v1.2
with: with:
key: ${{ github.job }} key: ${{ github.job }}
evict-old-files: 1d
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . ` cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_BUILD_TYPE=Release `
-DGGML_HIP=ON `
-DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS} cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
windows-latest-cmake-hip-release: windows-latest-cmake-hip-release:
@ -1214,12 +1046,6 @@ jobs:
with: with:
fetch-depth: 0 fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-release
evict-old-files: 1d
- name: Install - name: Install
id: depends id: depends
run: | run: |
@ -1240,13 +1066,7 @@ jobs:
run: | run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . ` cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_BUILD_TYPE=Release `
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
-DGGML_HIP=ON `
-DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS} cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\" md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
@ -1284,27 +1104,6 @@ jobs:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
- name: Build
id: cmake_build
run: |
sysctl -a
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
sudo cmake --install build --config Release
- name: xcodebuild for swift package
id: xcodebuild
run: |
xcodebuild -scheme llama-Package -destination 'generic/platform=iOS'
- name: Build Xcode project - name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
@ -1315,12 +1114,6 @@ jobs:
- name: Clone - name: Clone
uses: actions/checkout@v4 uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: android-build
evict-old-files: 1d
- name: Set up JDK - name: Set up JDK
uses: actions/setup-java@v3 uses: actions/setup-java@v3
with: with:
@ -1338,13 +1131,31 @@ jobs:
./gradlew build --no-daemon ./gradlew build --no-daemon
# freeBSD-latest:
# runs-on: macos-12
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Build
# uses: cross-platform-actions/action@v0.19.0
# with:
# operating_system: freebsd
# version: '13.2'
# hypervisor: 'qemu'
# run: |
# sudo pkg update
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
release: release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: needs:
- ubuntu-cpu-cmake - ubuntu-latest-cmake
- macOS-latest-cmake
- windows-latest-cmake - windows-latest-cmake
- windows-2019-cmake-cuda - windows-2019-cmake-cuda
- windows-latest-cmake-hip-release - windows-latest-cmake-hip-release
@ -1358,12 +1169,6 @@ jobs:
with: with:
fetch-depth: 0 fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: release
evict-old-files: 1d
- name: Determine tag name - name: Determine tag name
id: tag id: tag
shell: bash shell: bash
@ -1389,7 +1194,7 @@ jobs:
- name: Create release - name: Create release
id: create_release id: create_release
uses: ggml-org/action-create-release@v1 uses: anzz1/action-create-release@v1
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with: with:
@ -1609,37 +1414,3 @@ jobs:
# popd # popd
# emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} # emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
# make # make
openEuler-latest-cmake-cann:
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
defaults:
run:
shell: bash -el {0}
runs-on: ubuntu-24.04-arm
strategy:
matrix:
cann:
- '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
device:
- 'ascend910b3'
build:
- 'Release'
container: ascendai/cann:${{ matrix.cann }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Dependencies
run: |
yum update -y
yum install -y git gcc gcc-c++ make cmake
- name: Build
run: |
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-DGGML_CANN=on \
-DSOC_TYPE=${{ matrix.device }}
cmake --build build -j $(nproc)
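For reference, a rough local sketch of reproducing this CANN build outside of CI, assuming Docker is available and that the ascendai/cann image provides ASCEND_TOOLKIT_HOME as the job above relies on; the image tag and SOC type are the single matrix entries listed above.
# Sketch only: runs the same cmake invocation inside the CANN container.
docker run --rm -v "$PWD":/src -w /src \
    ascendai/cann:8.0.rc3.beta1-910b-openeuler22.03-py3.10 \
    bash -c '
        yum install -y git gcc gcc-c++ make cmake
        export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CANN=on -DSOC_TYPE=ascend910b3
        cmake --build build -j "$(nproc)"
    '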

@ -17,7 +17,7 @@ jobs:
steps: steps:
- uses: actions/stale@v5 - uses: actions/stale@v5
with: with:
exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap" exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
days-before-issue-stale: 30 days-before-issue-stale: 30
days-before-issue-close: 14 days-before-issue-close: 14
stale-issue-label: "stale" stale-issue-label: "stale"

@ -28,21 +28,27 @@ jobs:
push_to_registry: push_to_registry:
name: Push Docker image to Docker Hub name: Push Docker image to Docker Hub
runs-on: ubuntu-22.04 runs-on: ubuntu-latest
env: env:
COMMIT_SHA: ${{ github.sha }} COMMIT_SHA: ${{ github.sha }}
strategy: strategy:
fail-fast: false
matrix: matrix:
config: config:
# Multi-stage build - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false} - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true } #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
steps: steps:
- name: Check out the repo - name: Check out the repo
uses: actions/checkout@v4 uses: actions/checkout@v4
@ -50,10 +56,10 @@ jobs:
fetch-depth: 0 # preserve git history, so we can determine the build number fetch-depth: 0 # preserve git history, so we can determine the build number
- name: Set up QEMU - name: Set up QEMU
uses: docker/setup-qemu-action@v3 uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx - name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3 uses: docker/setup-buildx-action@v2
- name: Log in to Docker Hub - name: Log in to Docker Hub
uses: docker/login-action@v2 uses: docker/login-action@v2
@ -73,34 +79,26 @@ jobs:
# determine tag name postfix (build number, commit hash) # determine tag name postfix (build number, commit hash)
if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
TAG_POSTFIX="-b${BUILD_NUMBER}" TAG_POSTFIX="b${BUILD_NUMBER}"
else else
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-') SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}" TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
fi fi
# list all tags possible # list all tags possible
if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then TAGS=""
TYPE="" TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
else TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
TYPE="-${{ matrix.config.tag }}"
fi echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:" echo "output_tags=$TAGS" # print out for debugging
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
echo "full_output_tags=$FULLTAGS" # print out for debugging
echo "light_output_tags=$LIGHTTAGS" # print out for debugging
echo "server_output_tags=$SERVERTAGS" # print out for debugging
env: env:
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
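Worked example of the new tag scheme, using hypothetical values for the repository, build number, and commit hash; for the "cpu" config the TYPE suffix is dropped entirely, as in the branch above.
# Hypothetical inputs — only the tag-building logic mirrors the workflow step.
REPO_OWNER=ggerganov REPO_NAME=llama.cpp
GITHUB_BRANCH_NAME=master BUILD_NUMBER=4242 SHORT_HASH=a8046c8
CONFIG_TAG=cuda                      # matrix.config.tag; "cpu" yields an empty TYPE
if [ "$GITHUB_BRANCH_NAME" = "master" ]; then
    TAG_POSTFIX="-b${BUILD_NUMBER}"
else
    SAFE_NAME=$(echo "$GITHUB_BRANCH_NAME" | tr '/' '-')
    TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
fi
if [ "$CONFIG_TAG" = "cpu" ]; then TYPE=""; else TYPE="-${CONFIG_TAG}"; fi
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
echo "${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
# prints: ghcr.io/ggerganov/llama.cpp:full-cuda,ghcr.io/ggerganov/llama.cpp:full-cuda-b4242
The light and server tag lists follow the same pattern with the other two prefixes.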
# https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
- name: Free Disk Space (Ubuntu) - name: Free Disk Space (Ubuntu)
if: ${{ matrix.config.free_disk_space == true }} uses: jlumbroso/free-disk-space@main
uses: ggml-org/free-disk-space@v1.3.1
with: with:
# this might remove tools that are actually needed, # this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB # if set to "true" but frees about 6 GB
@ -115,59 +113,13 @@ jobs:
docker-images: true docker-images: true
swap-storage: true swap-storage: true
- name: Build and push Full Docker image (tagged + versioned) - name: Build and push Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }} if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
uses: docker/build-push-action@v6 uses: docker/build-push-action@v6
with: with:
context: . context: .
push: true push: true
platforms: ${{ matrix.config.platforms }} platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above # tag list is generated from step above
tags: ${{ steps.tag.outputs.full_output_tags }} tags: ${{ steps.tag.outputs.output_tags }}
file: ${{ matrix.config.dockerfile }} file: ${{ matrix.config.dockerfile }}
target: full
provenance: false
# using github experimental cache
cache-from: type=gha
cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
- name: Build and push Light Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
uses: docker/build-push-action@v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above
tags: ${{ steps.tag.outputs.light_output_tags }}
file: ${{ matrix.config.dockerfile }}
target: light
provenance: false
# using github experimental cache
cache-from: type=gha
cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
- name: Build and push Server Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
uses: docker/build-push-action@v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above
tags: ${{ steps.tag.outputs.server_output_tags }}
file: ${{ matrix.config.dockerfile }}
target: server
provenance: false
# using github experimental cache
cache-from: type=gha
cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
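For reference, a minimal sketch of building one of these images locally, assuming Docker with buildx; the CI job pushes all three targets of each Dockerfile to ghcr.io, whereas this only loads a single-platform image into the local daemon.
# Sketch only: one target of the "cpu" matrix entry.
docker buildx build . \
    --file .devops/cpu.Dockerfile \
    --target light \
    --platform linux/amd64 \
    --tag llama.cpp:light-local \
    --load
# "llama.cpp:light-local" is a hypothetical local tag, not one the workflow publishes.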

@ -23,7 +23,5 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- uses: editorconfig-checker/action-editorconfig-checker@v2 - uses: editorconfig-checker/action-editorconfig-checker@main
with:
version: v3.0.3
- run: editorconfig-checker - run: editorconfig-checker

@ -79,30 +79,7 @@ jobs:
# Setup nodejs (to be used for verifying bundled index.html) # Setup nodejs (to be used for verifying bundled index.html)
- uses: actions/setup-node@v4 - uses: actions/setup-node@v4
with: with:
node-version: '22.11.0' node-version: 22
- name: WebUI - Install dependencies
id: webui_lint
run: |
cd examples/server/webui
npm ci
- name: WebUI - Check code format
id: webui_format
run: |
git config --global --add safe.directory $(realpath .)
cd examples/server/webui
git status
npm run format
git status
modified_files="$(git status -s)"
echo "Modified files: ${modified_files}"
if [ -n "${modified_files}" ]; then
echo "Files do not follow coding style. To fix: npm run format"
echo "${modified_files}"
exit 1
fi
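For reference, the same format check can be run locally before pushing; a small sketch assuming Node.js 22 and npm are installed:
cd examples/server/webui
npm ci
npm run format
git status -s        # any modified files listed here mean the formatter changed something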
- name: Verify bundled index.html - name: Verify bundled index.html
id: verify_server_index_html id: verify_server_index_html
@ -110,7 +87,7 @@ jobs:
git config --global --add safe.directory $(realpath .) git config --global --add safe.directory $(realpath .)
cd examples/server/webui cd examples/server/webui
git status git status
npm ci
npm run build npm run build
git status git status
modified_files="$(git status -s)" modified_files="$(git status -s)"
@ -135,9 +112,9 @@ jobs:
-DGGML_OPENMP=OFF ; -DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers) - name: Build
id: cmake_build_sanitizers id: cmake_build
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }} if: ${{ matrix.sanitizer != 'THREAD' }}
run: | run: |
cmake -B build \ cmake -B build \
-DGGML_NATIVE=OFF \ -DGGML_NATIVE=OFF \
@ -147,31 +124,12 @@ jobs:
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests - name: Tests
id: server_integration_tests id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
run: | run: |
cd examples/server/tests cd examples/server/tests
./tests.sh ./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd examples/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests - name: Slow tests
id: server_integration_tests_slow id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
@ -228,7 +186,7 @@ jobs:
run: | run: |
cd examples/server/tests cd examples/server/tests
$env:PYTHONIOENCODING = ":replace" $env:PYTHONIOENCODING = ":replace"
pytest -v -x -m "not slow" pytest -v -x
- name: Slow tests - name: Slow tests
id: server_integration_tests_slow id: server_integration_tests_slow

.gitignore
@ -18,7 +18,6 @@
*.metallib *.metallib
*.o *.o
*.so *.so
*.swp
*.tmp *.tmp
# IDE / OS # IDE / OS

AUTHORS
@ -1,4 +1,4 @@
# date: Tue Feb 4 13:04:05 EET 2025 # date: Thu Nov 28 20:46:15 EET 2024
# this file is auto-generated by scripts/gen-authors.sh # this file is auto-generated by scripts/gen-authors.sh
0cc4m <picard12@live.de> 0cc4m <picard12@live.de>
@ -20,8 +20,6 @@ Adithya Balaji <adithya.b94@gmail.com>
AdithyanI <adithyan.i4internet@gmail.com> AdithyanI <adithyan.i4internet@gmail.com>
Adrian <smith.adriane@gmail.com> Adrian <smith.adriane@gmail.com>
Adrian Hesketh <a-h@users.noreply.github.com> Adrian Hesketh <a-h@users.noreply.github.com>
Adrien Gallouët <adrien@gallouet.fr>
Adrien Gallouët <angt@huggingface.co>
Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com> Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr> Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
@ -57,7 +55,6 @@ Ananta Bastola <anantarajbastola@gmail.com>
Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
András Salamon <ott2@users.noreply.github.com> András Salamon <ott2@users.noreply.github.com>
Andreas (Andi) Kunar <andreask@msn.com> Andreas (Andi) Kunar <andreask@msn.com>
Andreas Kieslinger <47689530+aendk@users.noreply.github.com>
Andrei <abetlen@gmail.com> Andrei <abetlen@gmail.com>
Andrew Canis <andrew.canis@gmail.com> Andrew Canis <andrew.canis@gmail.com>
Andrew Downing <andrew2085@gmail.com> Andrew Downing <andrew2085@gmail.com>
@ -94,17 +91,13 @@ Ben Siraphob <bensiraphob@gmail.com>
Ben Williams <ben@719ben.com> Ben Williams <ben@719ben.com>
Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com> Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com> Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
Benson Wong <mostlygeek@gmail.com>
Bernat Vadell <hounter.caza@gmail.com> Bernat Vadell <hounter.caza@gmail.com>
Bernhard M. Wiedemann <githubbmwprimary@lsmod.de>
Bert Wagner <github@bertwagner.com> Bert Wagner <github@bertwagner.com>
Billel Mokeddem <billel.mokeddem.ml@gmail.com>
Bingan <70050083+binganao@users.noreply.github.com> Bingan <70050083+binganao@users.noreply.github.com>
Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com> Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
Bodo Graumann <mail@bodograumann.de> Bodo Graumann <mail@bodograumann.de>
Bono Lv <lvscar@users.noreply.github.com> Bono Lv <lvscar@users.noreply.github.com>
Borislav Stanimirov <b.stanimirov@abv.bg> Borislav Stanimirov <b.stanimirov@abv.bg>
Borislav Stanimirov <b@ibob.bg>
Branden Butler <bwtbutler@hotmail.com> Branden Butler <bwtbutler@hotmail.com>
Brandon Squizzato <35474886+bsquizz@users.noreply.github.com> Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
Brian <mofosyne@gmail.com> Brian <mofosyne@gmail.com>
@ -124,7 +117,6 @@ Casey Primozic <casey@cprimozic.net>
Casey Primozic <me@ameo.link> Casey Primozic <me@ameo.link>
CausalLM <148736309+CausalLM@users.noreply.github.com> CausalLM <148736309+CausalLM@users.noreply.github.com>
Cebtenzzre <cebtenzzre@gmail.com> Cebtenzzre <cebtenzzre@gmail.com>
CentricStorm <CentricStorm@users.noreply.github.com>
Chad Brewbaker <crb002@gmail.com> Chad Brewbaker <crb002@gmail.com>
Changyeon Kim <cyzero.kim@samsung.com> Changyeon Kim <cyzero.kim@samsung.com>
Chao Jiang <jc19chaoj@zoho.com> Chao Jiang <jc19chaoj@zoho.com>
@ -139,15 +131,12 @@ Chris Kuehl <ckuehl@ckuehl.me>
Christian Demsar <christian@github.email.demsar.us> Christian Demsar <christian@github.email.demsar.us>
Christian Demsar <crasm@git.vczf.us> Christian Demsar <crasm@git.vczf.us>
Christian Falch <875252+chrfalch@users.noreply.github.com> Christian Falch <875252+chrfalch@users.noreply.github.com>
Christian Kastner <ckk@kvr.at>
Christian Kögler <ck3d@gmx.de> Christian Kögler <ck3d@gmx.de>
Christian Köhnenkamp <cvk5@me.com> Christian Köhnenkamp <cvk5@me.com>
Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
Christopher Nielsen <62156882+mascguy@users.noreply.github.com>
Clark Saben <76020733+csaben@users.noreply.github.com> Clark Saben <76020733+csaben@users.noreply.github.com>
Clint Herron <hanclinto@gmail.com> Clint Herron <hanclinto@gmail.com>
Conrad Kramer <conrad@conradkramer.com> Conrad Kramer <conrad@conradkramer.com>
Corentin REGAL <corentin.regal@gmail.com>
CrispStrobe <154636388+CrispStrobe@users.noreply.github.com> CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
Csaba Kecskemeti <csaba.kecskemeti@gmail.com> Csaba Kecskemeti <csaba.kecskemeti@gmail.com>
Cuong Trinh Manh <nguoithichkhampha@gmail.com> Cuong Trinh Manh <nguoithichkhampha@gmail.com>
@ -187,7 +176,6 @@ Dibakar Gope <dibakar.gope@arm.com>
Didzis Gosko <didzis@users.noreply.github.com> Didzis Gosko <didzis@users.noreply.github.com>
Diego Devesa <slarengh@gmail.com> Diego Devesa <slarengh@gmail.com>
Diogo Teles Sant'Anna <diogoteles@google.com> Diogo Teles Sant'Anna <diogoteles@google.com>
Djip007 <3705339+Djip007@users.noreply.github.com>
Djip007 <djip.perois@free.fr> Djip007 <djip.perois@free.fr>
Don Mahurin <dmahurin@users.noreply.github.com> Don Mahurin <dmahurin@users.noreply.github.com>
DooWoong Lee (David) <manics99@naver.com> DooWoong Lee (David) <manics99@naver.com>
@ -205,7 +193,6 @@ Edward Taylor <edeetee@gmail.com>
Elaine <elaine.zosa@gmail.com> Elaine <elaine.zosa@gmail.com>
Elbios <141279586+Elbios@users.noreply.github.com> Elbios <141279586+Elbios@users.noreply.github.com>
Elton Kola <eltonkola@gmail.com> Elton Kola <eltonkola@gmail.com>
Emreerdog <34742675+Emreerdog@users.noreply.github.com>
Engininja2 <139037756+Engininja2@users.noreply.github.com> Engininja2 <139037756+Engininja2@users.noreply.github.com>
Equim <sayaka@ekyu.moe> Equim <sayaka@ekyu.moe>
Eric Curtin <ecurtin@redhat.com> Eric Curtin <ecurtin@redhat.com>
@ -246,7 +233,6 @@ Fred Douglas <43351173+fredlas@users.noreply.github.com>
Frederik Vogel <Schaltfehler@users.noreply.github.com> Frederik Vogel <Schaltfehler@users.noreply.github.com>
Gabe Goodhart <gabe.l.hart@gmail.com> Gabe Goodhart <gabe.l.hart@gmail.com>
Gabe Goodhart <ghart@us.ibm.com> Gabe Goodhart <ghart@us.ibm.com>
Gaetan Bisson <gaetan@fenua.org>
GainLee <perfecter.gen@gmail.com> GainLee <perfecter.gen@gmail.com>
Galunid <karolek1231456@gmail.com> Galunid <karolek1231456@gmail.com>
Gary Linscott <glinscott@gmail.com> Gary Linscott <glinscott@gmail.com>
@ -263,7 +249,6 @@ Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
Guillaume Wenzek <gwenzek@users.noreply.github.com> Guillaume Wenzek <gwenzek@users.noreply.github.com>
Guoliang Hua <32868157+nbcsm@users.noreply.github.com> Guoliang Hua <32868157+nbcsm@users.noreply.github.com>
Guoteng <32697156+SolenoidWGT@users.noreply.github.com> Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com>
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com> Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
Haggai Nuchi <h.nuchi@gmail.com> Haggai Nuchi <h.nuchi@gmail.com>
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com> Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
@ -274,13 +259,11 @@ Haoxiang Fei <tonyfettes@tonyfettes.com>
Harald Fernengel <harald.fernengel@here.com> Harald Fernengel <harald.fernengel@here.com>
Hatsune Miku <129688334+at8u@users.noreply.github.com> Hatsune Miku <129688334+at8u@users.noreply.github.com>
HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com> HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
Haus1 <haus.xda@gmail.com>
Henk Poley <HenkPoley@gmail.com> Henk Poley <HenkPoley@gmail.com>
Henri Vasserman <henv@hot.ee> Henri Vasserman <henv@hot.ee>
Henrik Forstén <henrik.forsten@gmail.com> Henrik Forstén <henrik.forsten@gmail.com>
Herman Semenov <GermanAizek@yandex.ru> Herman Semenov <GermanAizek@yandex.ru>
Hesen Peng <hesen.peng@gmail.com> Hesen Peng <hesen.peng@gmail.com>
HimariO <dsfhe49854@gmail.com>
Hoang Nguyen <hugo53@users.noreply.github.com> Hoang Nguyen <hugo53@users.noreply.github.com>
Hong Bo PENG <penghb@cn.ibm.com> Hong Bo PENG <penghb@cn.ibm.com>
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com> Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
@ -297,7 +280,6 @@ Icecream95 <the.real.icecream95@gmail.com>
Ido S <ido.pluto@gmail.com> Ido S <ido.pluto@gmail.com>
IgnacioFDM <ignaciofdm@gmail.com> IgnacioFDM <ignaciofdm@gmail.com>
Igor Okulist <okigan@gmail.com> Igor Okulist <okigan@gmail.com>
Ihar Hrachyshka <ihrachys@redhat.com>
Ikko Eltociear Ashimine <eltociear@gmail.com> Ikko Eltociear Ashimine <eltociear@gmail.com>
Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com> Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
Ionoclast Laboratories <brigham@ionoclast.com> Ionoclast Laboratories <brigham@ionoclast.com>
@ -307,14 +289,12 @@ Ivan <nekotekina@gmail.com>
Ivan Filipov <159561759+vanaka11@users.noreply.github.com> Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
Ivan Komarov <Ivan.Komarov@dfyz.info> Ivan Komarov <Ivan.Komarov@dfyz.info>
Ivan Stepanov <ivanstepanovftw@gmail.com> Ivan Stepanov <ivanstepanovftw@gmail.com>
JFLFY2255 <JFLFY2255@163.com>
JH23X <165871467+JH23X@users.noreply.github.com> JH23X <165871467+JH23X@users.noreply.github.com>
Jack Mousseau <jack@software.inc> Jack Mousseau <jack@software.inc>
Jack Mousseau <jmousseau@users.noreply.github.com> Jack Mousseau <jmousseau@users.noreply.github.com>
JackJollimore <130917767+JackJollimore@users.noreply.github.com> JackJollimore <130917767+JackJollimore@users.noreply.github.com>
Jaeden Amero <jaeden@patater.com> Jaeden Amero <jaeden@patater.com>
Jaemin Son <woalsdnd@gmail.com> Jaemin Son <woalsdnd@gmail.com>
Jafar Uruç <jafar.uruc@gmail.com>
Jag Chadha <jagtesh@gmail.com> Jag Chadha <jagtesh@gmail.com>
Jakub N <jakubniemczyk97@gmail.com> Jakub N <jakubniemczyk97@gmail.com>
James A Capozzoli <157492257+jac-jim@users.noreply.github.com> James A Capozzoli <157492257+jac-jim@users.noreply.github.com>
@ -335,7 +315,6 @@ Jeffrey Morgan <jmorganca@gmail.com>
Jeffrey Quesnelle <emozilla@nousresearch.com> Jeffrey Quesnelle <emozilla@nousresearch.com>
Jeroen Mostert <jeroen.mostert@cm.com> Jeroen Mostert <jeroen.mostert@cm.com>
Jesse Jojo Johnson <williamsaintgeorge@gmail.com> Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
Jett Janiak <jettjaniak@gmail.com>
Jeximo <jeximo@gmail.com> Jeximo <jeximo@gmail.com>
Jhen-Jie Hong <iainst0409@gmail.com> Jhen-Jie Hong <iainst0409@gmail.com>
Jiahao Li <liplus17@163.com> Jiahao Li <liplus17@163.com>
@ -364,7 +343,6 @@ Josh Ramer <josh.ramer@icloud.com>
Joyce <joycebrum@google.com> Joyce <joycebrum@google.com>
Juan Calderon-Perez <835733+gaby@users.noreply.github.com> Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
Judd <foldl@users.noreply.github.com> Judd <foldl@users.noreply.github.com>
Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
Julius Arkenberg <arki05@users.noreply.github.com> Julius Arkenberg <arki05@users.noreply.github.com>
Jun Hee Yoo <contact.jhyoo@gmail.com> Jun Hee Yoo <contact.jhyoo@gmail.com>
Jun Jie <71215065+junnjiee16@users.noreply.github.com> Jun Jie <71215065+junnjiee16@users.noreply.github.com>
@ -379,7 +357,6 @@ Justine Tunney <jtunney@mozilla.com>
Juuso Alasuutari <juuso.alasuutari@gmail.com> Juuso Alasuutari <juuso.alasuutari@gmail.com>
KASR <karim.asrih@gmail.com> KASR <karim.asrih@gmail.com>
Kamil Tomšík <info@tomsik.cz> Kamil Tomšík <info@tomsik.cz>
Karol Kontny <82021046+kkontny@users.noreply.github.com>
Karsten Weiss <knweiss@gmail.com> Karsten Weiss <knweiss@gmail.com>
Karthick <j.karthic2004@gmail.com> Karthick <j.karthic2004@gmail.com>
Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com> Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
@ -399,7 +376,6 @@ Kolen Cheung <ickc@users.noreply.github.com>
Konstantin Herud <konstantin.herud@denkbares.com> Konstantin Herud <konstantin.herud@denkbares.com>
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com> Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
Kunshang Ji <kunshang.ji@intel.com> Kunshang Ji <kunshang.ji@intel.com>
Kyle Bruene <KyleBruene@users.noreply.github.com>
Kyle Liang <liangmanlai@gmail.com> Kyle Liang <liangmanlai@gmail.com>
Kyle Mistele <kyle@mistele.com> Kyle Mistele <kyle@mistele.com>
Kylin <56434533+KyL0N@users.noreply.github.com> Kylin <56434533+KyL0N@users.noreply.github.com>
@ -418,7 +394,6 @@ Liu Jia <jia3.liu@intel.com>
LoganDark <github@logandark.mozmail.com> LoganDark <github@logandark.mozmail.com>
Loïc Carrère <loic.carrere@gmail.com> Loïc Carrère <loic.carrere@gmail.com>
LostRuins <39025047+LostRuins@users.noreply.github.com> LostRuins <39025047+LostRuins@users.noreply.github.com>
LostRuins Concedo <39025047+LostRuins@users.noreply.github.com>
Luciano <lucianostrika44@gmail.com> Luciano <lucianostrika44@gmail.com>
Luo Tian <lt@basecity.com> Luo Tian <lt@basecity.com>
Lyle Dean <dean@lyle.dev> Lyle Dean <dean@lyle.dev>
@ -448,7 +423,6 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com> Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
Matheus C. França <matheus-catarino@hotmail.com> Matheus C. França <matheus-catarino@hotmail.com>
Matheus Gabriel Alves Silva <matheusgasource@gmail.com> Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
Mathieu Baudier <mbaudier@argeo.org>
Mathieu Geli <mathieu.geli@gmail.com> Mathieu Geli <mathieu.geli@gmail.com>
Mathieu Nayrolles <MathieuNls@users.noreply.github.com> Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
Mathijs Henquet <mathijs.henquet@gmail.com> Mathijs Henquet <mathijs.henquet@gmail.com>
@ -470,7 +444,6 @@ Meng, Hengyu <hengyu.meng@intel.com>
Mengqing Cao <cmq0113@163.com> Mengqing Cao <cmq0113@163.com>
Merrick Christensen <merrick.christensen@gmail.com> Merrick Christensen <merrick.christensen@gmail.com>
Michael Coppola <m18coppola@gmail.com> Michael Coppola <m18coppola@gmail.com>
Michael Engel <mengel@redhat.com>
Michael Francis <edude03@gmail.com> Michael Francis <edude03@gmail.com>
Michael Hueschen <m@mhueschen.dev> Michael Hueschen <m@mhueschen.dev>
Michael Kesper <mkesper@schokokeks.org> Michael Kesper <mkesper@schokokeks.org>
@ -479,9 +452,7 @@ Michael Podvitskiy <podvitskiymichael@gmail.com>
Michael Potter <NanoTekGuy@Gmail.com> Michael Potter <NanoTekGuy@Gmail.com>
Michael de Gans <michael.john.degans@gmail.com> Michael de Gans <michael.john.degans@gmail.com>
Michaël de Vries <vriesdemichael@gmail.com> Michaël de Vries <vriesdemichael@gmail.com>
Michał Moskal <michal@moskal.me>
Michał Tuszyński <srgtuszy@gmail.com> Michał Tuszyński <srgtuszy@gmail.com>
Michelle Tan <41475767+MichelleTanPY@users.noreply.github.com>
Mihai <mihai.chirculescu@yahoo.com> Mihai <mihai.chirculescu@yahoo.com>
Mike <ytianhui2004@gmail.com> Mike <ytianhui2004@gmail.com>
Mikko Juola <mikjuo@gmail.com> Mikko Juola <mikjuo@gmail.com>
@ -506,7 +477,6 @@ Neo Zhang <14088817+arthw@users.noreply.github.com>
Neo Zhang <zhang.jianyu@outlook.com> Neo Zhang <zhang.jianyu@outlook.com>
Neo Zhang Jianyu <jianyu.zhang@intel.com> Neo Zhang Jianyu <jianyu.zhang@intel.com>
Neuman Vong <neuman.vong@gmail.com> Neuman Vong <neuman.vong@gmail.com>
NeverLucky <92274250+nvrxq@users.noreply.github.com>
Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
Nexesenex <124105151+Nexesenex@users.noreply.github.com> Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Niall Coates <1349685+Niall-@users.noreply.github.com> Niall Coates <1349685+Niall-@users.noreply.github.com>
@ -514,15 +484,11 @@ Nicholai Tukanov <nicholaitukanov@gmail.com>
Nico Bosshard <nico@bosshome.ch> Nico Bosshard <nico@bosshome.ch>
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de> Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
Nicolás Pérez <nicolas_perez@brown.edu> Nicolás Pérez <nicolas_perez@brown.edu>
Nicolò Scipione <nicolo.scipione@codeplay.com>
Nigel Bosch <pnigelb@gmail.com> Nigel Bosch <pnigelb@gmail.com>
Nikita Sarychev <42014488+sARY77@users.noreply.github.com>
Niklas Korz <niklas@niklaskorz.de> Niklas Korz <niklas@niklaskorz.de>
NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com> NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com>
Nikolaos Pothitos <pothitos@di.uoa.gr>
Nikolas <127742645+nneubacher@users.noreply.github.com> Nikolas <127742645+nneubacher@users.noreply.github.com>
Nindaleth <Nindaleth@users.noreply.github.com> Nindaleth <Nindaleth@users.noreply.github.com>
Nuno <rare-magma@posteo.eu>
OSecret <135510162+OLSecret@users.noreply.github.com> OSecret <135510162+OLSecret@users.noreply.github.com>
Oleksandr Nikitin <oleksandr@tvori.info> Oleksandr Nikitin <oleksandr@tvori.info>
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com> Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
@ -538,7 +504,6 @@ Pavel Zloi <github.com@drteam.rocks>
Pavol Rusnak <pavol@rusnak.io> Pavol Rusnak <pavol@rusnak.io>
Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com> Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com>
Pedro Cuenca <pedro@huggingface.co> Pedro Cuenca <pedro@huggingface.co>
Peter <peter277@users.noreply.github.com>
Peter Sugihara <peter@campsh.com> Peter Sugihara <peter@campsh.com>
Phil H <5756783+phiharri@users.noreply.github.com> Phil H <5756783+phiharri@users.noreply.github.com>
Philip Taron <philip.taron@gmail.com> Philip Taron <philip.taron@gmail.com>
@ -564,12 +529,9 @@ Rand Xie <randxiexyy29@gmail.com>
Randall Fitzgerald <randall@dasaku.net> Randall Fitzgerald <randall@dasaku.net>
Random Fly <renfei8@live.cn> Random Fly <renfei8@live.cn>
Reinforce-II <fate@eastal.com> Reinforce-II <fate@eastal.com>
Rémy Oudompheng <oudomphe@phare.normalesup.org>
Ren Xuancheng <jklj077@users.noreply.github.com> Ren Xuancheng <jklj077@users.noreply.github.com>
Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com> Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
Reza Kakhki <rezakakhki.de@gmail.com>
RhinoDevel <RhinoDevel@users.noreply.github.com> RhinoDevel <RhinoDevel@users.noreply.github.com>
Riccardo Orlando <Riccorl@users.noreply.github.com>
Riceball LEE <snowyu.lee@gmail.com> Riceball LEE <snowyu.lee@gmail.com>
Rich Dougherty <rich@rd.nz> Rich Dougherty <rich@rd.nz>
Richard Kiss <him@richardkiss.com> Richard Kiss <him@richardkiss.com>
@ -582,8 +544,6 @@ Riley Stewart <ristew@users.noreply.github.com>
Rinne <AsakusaRinne@gmail.com> Rinne <AsakusaRinne@gmail.com>
Rinne <liu_yaohui1998@126.com> Rinne <liu_yaohui1998@126.com>
Robert Brisita <986796+rbrisita@users.noreply.github.com> Robert Brisita <986796+rbrisita@users.noreply.github.com>
Robert Collins <roberto.tomas.cuentas@gmail.com>
Robert Ormandi <52251610+ormandi@users.noreply.github.com>
Robert Sung-wook Shin <edp1096@users.noreply.github.com> Robert Sung-wook Shin <edp1096@users.noreply.github.com>
Robey Holderith <robey@flaminglunchbox.net> Robey Holderith <robey@flaminglunchbox.net>
Robyn <robyngraf@users.noreply.github.com> Robyn <robyngraf@users.noreply.github.com>
@ -599,9 +559,7 @@ Roni <sulpher@gmx.net>
Ronny Brendel <ronnybrendel@gmail.com> Ronny Brendel <ronnybrendel@gmail.com>
Ronsor <ronsor@ronsor.pw> Ronsor <ronsor@ronsor.pw>
Rowan Hart <rowanbhart@gmail.com> Rowan Hart <rowanbhart@gmail.com>
Ruan <47767371+ruanych@users.noreply.github.com>
Ruchira Hasaranga <ruchira66@gmail.com> Ruchira Hasaranga <ruchira66@gmail.com>
Rudi Servo <rudiservo@gmail.com>
Ruixin Huang <18860020911@163.com> Ruixin Huang <18860020911@163.com>
Rune <43761327+Rune-AI@users.noreply.github.com> Rune <43761327+Rune-AI@users.noreply.github.com>
RunningLeon <maningsheng@sensetime.com> RunningLeon <maningsheng@sensetime.com>
@ -665,14 +623,12 @@ Steven Roussey <sroussey@gmail.com>
Steward Garcia <57494570+FSSRepo@users.noreply.github.com> Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com> StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com> Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
Sukriti Sharma <Ssukriti@users.noreply.github.com>
SuperUserNameMan <yoann@terminajones.com> SuperUserNameMan <yoann@terminajones.com>
Sutou Kouhei <kou@cozmixng.org> Sutou Kouhei <kou@cozmixng.org>
Tai Duc Nguyen <taiducnguyen.drexel@gmail.com> Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
Taikono-Himazin <kazu@po.harenet.ne.jp> Taikono-Himazin <kazu@po.harenet.ne.jp>
Tameem <113388789+AhmadTameem@users.noreply.github.com> Tameem <113388789+AhmadTameem@users.noreply.github.com>
Tamotsu Takahashi <ttakah+github@gmail.com> Tamotsu Takahashi <ttakah+github@gmail.com>
Tei Home <taiteitonghome@proton.me>
Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com> Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
Thatcher Chamberlin <j.thatcher.c@gmail.com> Thatcher Chamberlin <j.thatcher.c@gmail.com>
Theia Vogel <theia@vgel.me> Theia Vogel <theia@vgel.me>
@ -684,7 +640,6 @@ Tim Miller <drasticactions@users.noreply.github.com>
Tim Wang <overocean@gmail.com> Tim Wang <overocean@gmail.com>
Timmy Knight <r2d2fish@gmail.com> Timmy Knight <r2d2fish@gmail.com>
Timothy Cronin <40186632+4imothy@users.noreply.github.com> Timothy Cronin <40186632+4imothy@users.noreply.github.com>
Ting Lou <louting@189.cn>
Ting Lou <ting.lou@gmail.com> Ting Lou <ting.lou@gmail.com>
Ting Sun <suntcrick@gmail.com> Ting Sun <suntcrick@gmail.com>
Tobias Lütke <tobi@shopify.com> Tobias Lütke <tobi@shopify.com>
@ -706,7 +661,6 @@ Uzo Nweke <uzoechi@gmail.com>
Vaibhav Srivastav <vaibhavs10@gmail.com> Vaibhav Srivastav <vaibhavs10@gmail.com>
Val Kharitonov <mail@kharvd.com> Val Kharitonov <mail@kharvd.com>
Valentin Konovalov <valle.ketsujin@gmail.com> Valentin Konovalov <valle.ketsujin@gmail.com>
Valentin Mamedov <45292985+Inf1delis@users.noreply.github.com>
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com> Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
Vali Malinoiu <0x4139@gmail.com> Vali Malinoiu <0x4139@gmail.com>
Victor Nogueira <felladrin@gmail.com> Victor Nogueira <felladrin@gmail.com>
@ -719,17 +673,13 @@ Vladimir Malyutin <first-leon@yandex.ru>
Vladimir Zorin <vladimir@deviant.guru> Vladimir Zorin <vladimir@deviant.guru>
VoidIsVoid <343750470@qq.com> VoidIsVoid <343750470@qq.com>
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com> Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
Wang Qin <37098874+wangqin0@users.noreply.github.com>
Wang Ran (汪然) <wangr@smail.nju.edu.cn>
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com> WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
Weird Constructor <weirdconstructor@gmail.com> Weird Constructor <weirdconstructor@gmail.com>
Welby Seely <welbyseely@gmail.com> Welby Seely <welbyseely@gmail.com>
Wentai Zhang <rchardx@gmail.com> Wentai Zhang <rchardx@gmail.com>
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com> WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
William Tambellini <william.tambellini@gmail.com> William Tambellini <william.tambellini@gmail.com>
William Tambellini <wtambellini@sdl.com>
Willy Tarreau <w@1wt.eu> Willy Tarreau <w@1wt.eu>
Woof Dog <197125663+woof-dog@users.noreply.github.com>
Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com> Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
Wu Jian Ping <wujjpp@hotmail.com> Wu Jian Ping <wujjpp@hotmail.com>
Wu Jian Ping <wujp@greatld.com> Wu Jian Ping <wujp@greatld.com>
@ -742,7 +692,6 @@ Xie Yanbo <xieyanbo@gmail.com>
Xingchen Song(宋星辰) <xingchensong1996@163.com> Xingchen Song(宋星辰) <xingchensong1996@163.com>
Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com> Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
Xuan Son Nguyen <thichthat@gmail.com> Xuan Son Nguyen <thichthat@gmail.com>
Xuan-Son Nguyen <thichthat@gmail.com>
Yaiko <elyaiko@hotmail.com> Yaiko <elyaiko@hotmail.com>
Yann Follet <131855179+YannFollet@users.noreply.github.com> Yann Follet <131855179+YannFollet@users.noreply.github.com>
Yaroslav <yaroslav.yashin@me.com> Yaroslav <yaroslav.yashin@me.com>
@ -753,9 +702,7 @@ Yoshi Suhara <y.suhara@gmail.com>
Yoshi Suhara <ysuhara@nvidia.com> Yoshi Suhara <ysuhara@nvidia.com>
Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com> Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
Yüg <eugeniosegalaweb@gmail.com>
Yui <dev@sleepyyui.com> Yui <dev@sleepyyui.com>
Yun Dou <dixyes@gmail.com>
Yuri Khrustalev <ykhrustalev@users.noreply.github.com> Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Yusuf Kağan Hanoğlu <hanoglu@yahoo.com> Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com> Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
@ -767,23 +714,18 @@ Zhang Peiyuan <a1286225768@gmail.com>
Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com> Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com> Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
Zhiyuan Li <lizhiyuan@uniartisan.com> Zhiyuan Li <lizhiyuan@uniartisan.com>
Zhiyuan Li <uniartisan2017@gmail.com>
ZhouYuChen <zhouyuchen@naver.com> ZhouYuChen <zhouyuchen@naver.com>
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com> Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com> Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
Zsapi <martin1.zsapka@gmail.com> Zsapi <martin1.zsapka@gmail.com>
a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com> a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
a3sh <38979186+A3shTnT@users.noreply.github.com>
adel boussaken <netdur@gmail.com> adel boussaken <netdur@gmail.com>
afrideva <95653597+afrideva@users.noreply.github.com> afrideva <95653597+afrideva@users.noreply.github.com>
ag2s20150909 <19373730+ag2s20150909@users.noreply.github.com>
agray3 <agray3@users.noreply.github.com> agray3 <agray3@users.noreply.github.com>
akawrykow <142945436+akawrykow@users.noreply.github.com> akawrykow <142945436+akawrykow@users.noreply.github.com>
alek3y <44779186+alek3y@users.noreply.github.com>
alexpinel <93524949+alexpinel@users.noreply.github.com> alexpinel <93524949+alexpinel@users.noreply.github.com>
alonfaraj <alonfaraj@gmail.com> alonfaraj <alonfaraj@gmail.com>
alwqx <kenan3015@gmail.com> alwqx <kenan3015@gmail.com>
amd-dwang <dong.wang@amd.com>
amd-lalithnc <lalithnc@amd.com> amd-lalithnc <lalithnc@amd.com>
amritahs-ibm <amritahs@linux.vnet.ibm.com> amritahs-ibm <amritahs@linux.vnet.ibm.com>
andrijdavid <david@geek.mg> andrijdavid <david@geek.mg>
@ -795,7 +737,6 @@ arch-btw <57669023+arch-btw@users.noreply.github.com>
arcrank <arcrank@gmail.com> arcrank <arcrank@gmail.com>
ardfork <134447697+ardfork@users.noreply.github.com> ardfork <134447697+ardfork@users.noreply.github.com>
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
aryantandon01 <80969509+aryantandon01@users.noreply.github.com>
at8u <129688334+at8u@users.noreply.github.com> at8u <129688334+at8u@users.noreply.github.com>
automaticcat <daogiatuank54@gmail.com> automaticcat <daogiatuank54@gmail.com>
awatuna <23447591+awatuna@users.noreply.github.com> awatuna <23447591+awatuna@users.noreply.github.com>
@ -810,14 +751,12 @@ bryanSwk <93190252+bryanSwk@users.noreply.github.com>
bsilvereagle <bsilvereagle@users.noreply.github.com> bsilvereagle <bsilvereagle@users.noreply.github.com>
bssrdf <merlintiger@hotmail.com> bssrdf <merlintiger@hotmail.com>
byte-6174 <88070277+byte-6174@users.noreply.github.com> byte-6174 <88070277+byte-6174@users.noreply.github.com>
cduk <19917266+cduk@users.noreply.github.com>
cebtenzzre <cebtenzzre@gmail.com> cebtenzzre <cebtenzzre@gmail.com>
chaihahaha <chai836275709@gmail.com> chaihahaha <chai836275709@gmail.com>
chiranko <96988916+chiranko@users.noreply.github.com> chiranko <96988916+chiranko@users.noreply.github.com>
clibdev <52199778+clibdev@users.noreply.github.com> clibdev <52199778+clibdev@users.noreply.github.com>
clyang <clyang@clyang.net> clyang <clyang@clyang.net>
cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com> cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
codezjx <code.zjx@gmail.com>
coezbek <c.oezbek@gmail.com> coezbek <c.oezbek@gmail.com>
comex <comexk@gmail.com> comex <comexk@gmail.com>
compilade <113953597+compilade@users.noreply.github.com> compilade <113953597+compilade@users.noreply.github.com>
@ -841,17 +780,14 @@ drbh <david.richard.holtz@gmail.com>
ds5t5 <145942675+ds5t5@users.noreply.github.com> ds5t5 <145942675+ds5t5@users.noreply.github.com>
dylan <canardleteer@users.noreply.github.com> dylan <canardleteer@users.noreply.github.com>
eastriver <lee@eastriver.dev> eastriver <lee@eastriver.dev>
ebraminio <ebrahim@gnu.org>
ebraminio <ebraminio@gmail.com> ebraminio <ebraminio@gmail.com>
eiery <19350831+eiery@users.noreply.github.com> eiery <19350831+eiery@users.noreply.github.com>
eric8607242 <e0928021388@gmail.com> eric8607242 <e0928021388@gmail.com>
fairydreaming <166155368+fairydreaming@users.noreply.github.com> fairydreaming <166155368+fairydreaming@users.noreply.github.com>
fengerhu1 <2748250768@qq.com> fengerhu1 <2748250768@qq.com>
fj-y-saito <85871716+fj-y-saito@users.noreply.github.com>
fraxy-v <65565042+fraxy-v@users.noreply.github.com> fraxy-v <65565042+fraxy-v@users.noreply.github.com>
github-actions[bot] <github-actions[bot]@users.noreply.github.com> github-actions[bot] <github-actions[bot]@users.noreply.github.com>
gliptic <gliptic@users.noreply.github.com> gliptic <gliptic@users.noreply.github.com>
gn64 <yukikaze.jp@gmail.com>
goerch <jhr.walter@t-online.de> goerch <jhr.walter@t-online.de>
grahameth <96447521+grahameth@users.noreply.github.com> grahameth <96447521+grahameth@users.noreply.github.com>
gtygo <gtydoit@gmail.com> gtygo <gtydoit@gmail.com>
@ -876,12 +812,10 @@ icppWorld <124377669+icppWorld@users.noreply.github.com>
igarnier <igarnier@protonmail.com> igarnier <igarnier@protonmail.com>
intelmatt <61025942+intelmatt@users.noreply.github.com> intelmatt <61025942+intelmatt@users.noreply.github.com>
iohub <rickyang.pro@gmail.com> iohub <rickyang.pro@gmail.com>
issixx <46835150+issixx@users.noreply.github.com>
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com> jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
jameswu2014 <545426914@qq.com> jameswu2014 <545426914@qq.com>
jdomke <28772296+jdomke@users.noreply.github.com> jdomke <28772296+jdomke@users.noreply.github.com>
jiahao su <damow890@gmail.com>
jiez <373447296@qq.com> jiez <373447296@qq.com>
jneem <joeneeman@gmail.com> jneem <joeneeman@gmail.com>
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com> joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
@ -894,7 +828,6 @@ junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
jwj7140 <32943891+jwj7140@users.noreply.github.com> jwj7140 <32943891+jwj7140@users.noreply.github.com>
k.h.lai <adrian.k.h.lai@outlook.com> k.h.lai <adrian.k.h.lai@outlook.com>
kaizau <kaizau@users.noreply.github.com> kaizau <kaizau@users.noreply.github.com>
kallewoof <kalle.alm@gmail.com>
kalomaze <66376113+kalomaze@users.noreply.github.com> kalomaze <66376113+kalomaze@users.noreply.github.com>
kang <tpdns9032100@gmail.com> kang <tpdns9032100@gmail.com>
katsu560 <118887472+katsu560@users.noreply.github.com> katsu560 <118887472+katsu560@users.noreply.github.com>
@ -902,7 +835,6 @@ kchro3 <62481661+kchro3@users.noreply.github.com>
khimaros <me@khimaros.com> khimaros <me@khimaros.com>
kiltyj <kiltyj@gmail.com> kiltyj <kiltyj@gmail.com>
klosax <131523366+klosax@users.noreply.github.com> klosax <131523366+klosax@users.noreply.github.com>
krystiancha <krystian@krystianch.com>
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
kunnis <kunnis@users.noreply.github.com> kunnis <kunnis@users.noreply.github.com>
kuronekosaiko <EvanChanJ@163.com> kuronekosaiko <EvanChanJ@163.com>
@ -915,8 +847,6 @@ ldwang <ftgreat@163.com>
le.chang <cljs118@126.com> le.chang <cljs118@126.com>
leejet <leejet714@gmail.com> leejet <leejet714@gmail.com>
leo-pony <nengjunma@outlook.com> leo-pony <nengjunma@outlook.com>
lexasub <lexakopp2212@gmail.com>
lhez <quic_lih@quicinc.com>
limitedAtonement <limitedAtonement@users.noreply.github.com> limitedAtonement <limitedAtonement@users.noreply.github.com>
liuwei-git <14815172+liuwei-git@users.noreply.github.com> liuwei-git <14815172+liuwei-git@users.noreply.github.com>
lon <114724657+longregen@users.noreply.github.com> lon <114724657+longregen@users.noreply.github.com>
@ -925,13 +855,10 @@ ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
luoyu-intel <yu.luo@intel.com> luoyu-intel <yu.luo@intel.com>
m3ndax <adrian.goessl@outlook.com> m3ndax <adrian.goessl@outlook.com>
maddes8cht <55592906+maddes8cht@users.noreply.github.com> maddes8cht <55592906+maddes8cht@users.noreply.github.com>
mahorozte <41834471+mahorozte@users.noreply.github.com>
makomk <makosoft@googlemail.com> makomk <makosoft@googlemail.com>
manikbhandari <mbbhandarimanik2@gmail.com> manikbhandari <mbbhandarimanik2@gmail.com>
maor-ps <154728172+maor-ps@users.noreply.github.com> maor-ps <154728172+maor-ps@users.noreply.github.com>
mashdragon <122402293+mashdragon@users.noreply.github.com>
matiaslin <45382001+matiaslin@users.noreply.github.com> matiaslin <45382001+matiaslin@users.noreply.github.com>
matt23654 <matthew.webber@protonmail.com>
matteo <matteogeniaccio@yahoo.it> matteo <matteogeniaccio@yahoo.it>
mdrokz <mohammadmunshi@gmail.com> mdrokz <mohammadmunshi@gmail.com>
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
@ -941,7 +868,6 @@ mmyjona <jonathan.gonse@gmail.com>
momonga <115213907+mmnga@users.noreply.github.com> momonga <115213907+mmnga@users.noreply.github.com>
momonga <146910567+mmngays@users.noreply.github.com> momonga <146910567+mmngays@users.noreply.github.com>
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com> moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
musoles <135031143+musoles@users.noreply.github.com>
mzcu <milos.cubrilo@gmail.com> mzcu <milos.cubrilo@gmail.com>
nanahi <130121847+na-na-hi@users.noreply.github.com> nanahi <130121847+na-na-hi@users.noreply.github.com>
ngc92 <7938269+ngc92@users.noreply.github.com> ngc92 <7938269+ngc92@users.noreply.github.com>
@ -959,7 +885,6 @@ oobabooga <112222186+oobabooga@users.noreply.github.com>
opparco <parco.opaai@gmail.com> opparco <parco.opaai@gmail.com>
ostix360 <55257054+ostix360@users.noreply.github.com> ostix360 <55257054+ostix360@users.noreply.github.com>
pculliton <phillipculliton@gmail.com> pculliton <phillipculliton@gmail.com>
peidaqi <peidaqi@gmail.com>
pengxin99 <pengxin.yuan@intel.com> pengxin99 <pengxin.yuan@intel.com>
perserk <perserk@gmail.com> perserk <perserk@gmail.com>
piDack <104877312+piDack@users.noreply.github.com> piDack <104877312+piDack@users.noreply.github.com>
@ -967,12 +892,10 @@ pmysl <piotr.myslinski@outlook.com>
postmasters <namnguyen@google.com> postmasters <namnguyen@google.com>
pudepiedj <pudepiedj@gmail.com> pudepiedj <pudepiedj@gmail.com>
qingfengfenga <41416092+qingfengfenga@users.noreply.github.com> qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
qingy1337 <qxli2@students.everettcc.edu>
qouoq <qouoq@fastmail.com> qouoq <qouoq@fastmail.com>
qunash <anzoria@gmail.com> qunash <anzoria@gmail.com>
rabidcopy <rabidcopy@yahoo.com> rabidcopy <rabidcopy@yahoo.com>
rankaiyx <rankaiyx@rankaiyx.com> rankaiyx <rankaiyx@rankaiyx.com>
redbeard <bharrington@alticon.net>
rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com> rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
rhuddleston <ryan.huddleston@percona.com> rhuddleston <ryan.huddleston@percona.com>
rimoliga <53384203+rimoliga@users.noreply.github.com> rimoliga <53384203+rimoliga@users.noreply.github.com>
@ -989,7 +912,6 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
slaren <2141330+slaren@users.noreply.github.com> slaren <2141330+slaren@users.noreply.github.com>
slaren <slarengh@gmail.com> slaren <slarengh@gmail.com>
snadampal <87143774+snadampal@users.noreply.github.com> snadampal <87143774+snadampal@users.noreply.github.com>
someone13574 <81528246+someone13574@users.noreply.github.com>
standby24x7 <standby24x7@gmail.com> standby24x7 <standby24x7@gmail.com>
staviq <staviq@gmail.com> staviq <staviq@gmail.com>
stduhpf <stephduh@live.fr> stduhpf <stephduh@live.fr>
@ -1009,7 +931,6 @@ uint256_t <konndennsa@gmail.com>
uint256_t <maekawatoshiki1017@gmail.com> uint256_t <maekawatoshiki1017@gmail.com>
unbounded <haakon@likedan.net> unbounded <haakon@likedan.net>
uvos <devnull@uvos.xyz> uvos <devnull@uvos.xyz>
uvos <philipp@uvos.xyz>
valiray <133289098+valiray@users.noreply.github.com> valiray <133289098+valiray@users.noreply.github.com>
vb <vaibhavs10@gmail.com> vb <vaibhavs10@gmail.com>
vik <vikhyatk@gmail.com> vik <vikhyatk@gmail.com>
@ -1030,7 +951,6 @@ xaedes <xaedes@googlemail.com>
xctan <axunlei@gmail.com> xctan <axunlei@gmail.com>
xloem <0xloem@gmail.com> xloem <0xloem@gmail.com>
yangli2 <yangli2@gmail.com> yangli2 <yangli2@gmail.com>
ymcki <84055651+ymcki@users.noreply.github.com>
yuiseki <yuiseki@gmail.com> yuiseki <yuiseki@gmail.com>
yuri@FreeBSD <yurivict@users.noreply.github.com> yuri@FreeBSD <yurivict@users.noreply.github.com>
zakkor <edward.partenie@gmail.com> zakkor <edward.partenie@gmail.com>
@ -1043,5 +963,4 @@ zrm <trustiosity.zrm@gmail.com>
杨朱 · Kiki <baofa.fan@daocloud.io> 杨朱 · Kiki <baofa.fan@daocloud.io>
源文雨 <41315874+fumiama@users.noreply.github.com> 源文雨 <41315874+fumiama@users.noreply.github.com>
蕭澧邦 <45505768+shou692199@users.noreply.github.com> 蕭澧邦 <45505768+shou692199@users.noreply.github.com>
谢乃闻 <sienaiwun@users.noreply.github.com>
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com> Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>

View file

@ -16,7 +16,6 @@ endif()
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(LLAMA_STANDALONE ON) set(LLAMA_STANDALONE ON)
@ -47,11 +46,11 @@ if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS) add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
endif() endif()
if (MSVC) if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>") add_compile_options("$<$<COMPILE_LANGUAGE:C>:/source-charset:utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>") add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/source-charset:utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>") add_compile_options("$<$<COMPILE_LANGUAGE:C>:/execution-charset:utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>") add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/execution-charset:utf-8>")
endif() endif()
# #
@ -80,15 +79,17 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
# 3rd party libs # 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
# Required for relocatable CMake package # Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
# override ggml options # override ggml options
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
# change the default for these ggml options # change the default for these ggml options
if (NOT DEFINED GGML_LLAMAFILE) if (NOT DEFINED GGML_LLAMAFILE)
@ -118,62 +119,16 @@ llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN) llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
message(STATUS "Using -fsanitize=thread")
add_compile_options(-fsanitize=thread)
link_libraries (-fsanitize=thread)
endif()
if (LLAMA_SANITIZE_ADDRESS)
message(STATUS "Using -fsanitize=address")
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
link_libraries (-fsanitize=address)
endif()
if (LLAMA_SANITIZE_UNDEFINED)
message(STATUS "Using -fsanitize=undefined")
add_compile_options(-fsanitize=undefined)
link_libraries (-fsanitize=undefined)
endif()
endif()
# #
# 3rd-party # build the library
# #
if (NOT TARGET ggml) if (NOT TARGET ggml)
add_subdirectory(ggml) add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt # ... otherwise assume ggml is added by a parent CMakeLists.txt
endif() endif()
#
# build the library
#
add_subdirectory(src) add_subdirectory(src)
#
# utils, programs, examples and tests
#
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()
# #
# install # install
# #
@ -189,14 +144,27 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
# At the moment some compile definitions are placed within the ggml/src
# directory but not exported on the `ggml` target. This could be improved by
# determining _precisely_ which defines are necessary for the llama-config
# package.
#
set(GGML_TRANSIENT_DEFINES)
get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
if (GGML_DIR_DEFINES)
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
endif()
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
if (GGML_TARGET_DEFINES)
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
endif()
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
# all public headers
set(LLAMA_PUBLIC_HEADERS set(LLAMA_PUBLIC_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h) ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
set_target_properties(llama
PROPERTIES
PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
install(TARGETS llama LIBRARY PUBLIC_HEADER) install(TARGETS llama LIBRARY PUBLIC_HEADER)
configure_package_config_file( configure_package_config_file(
@ -233,4 +201,22 @@ configure_file(cmake/llama.pc.in
@ONLY) @ONLY)
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) DESTINATION lib/pkgconfig)
#
# utils, programs, examples and tests
#
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()

View file

@ -31,13 +31,6 @@
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } }, { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
{ "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } }, { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },
{
"name": "x64-windows-llvm", "hidden": true,
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
}
},
{ {
"name": "arm64-windows-msvc", "hidden": true, "name": "arm64-windows-msvc", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" }, "architecture": { "value": "arm64", "strategy": "external" },
@ -77,11 +70,6 @@
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] }, { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] }, { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
{ "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
{ "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
{ "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] }, { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] }, { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] }, { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },

View file

@ -1,11 +1,3 @@
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
/ci/ @ggerganov ci/ @ggerganov
/.devops/*.Dockerfile @ngxson
/examples/server/ @ngxson
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler

View file

@ -1,10 +1,10 @@
# Pull requests (for contributors) # Pull requests (for contributors)
- Test your changes: - Test your changes:
- Execute [the full CI locally on your machine](ci/README.md) before publishing - Execute [the full CI locally on your machine](ci/README.md) before publishing
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`) - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends) - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops` - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments - If your PR becomes stale, don't hesitate to ping the maintainers in the comments
@ -20,104 +20,14 @@
- Avoid adding third-party dependencies, extra files, extra headers, etc. - Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures - Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple - Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- Vertical alignment makes things more readable and easier to batch edit - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Use sized integer types such as `int32_t` in the public API; `size_t` may also be appropriate for allocation sizes or byte offsets
- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo`
- In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary
```cpp
// OK
llama_context * ctx;
const llama_rope_type rope_type;
// not OK
struct llama_context * ctx;
const enum llama_rope_type rope_type;
```
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ - Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
![matmul](media/matmul.png) ![matmul](media/matmul.png)
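To make the shape convention above concrete, here is a minimal sketch assuming the public `ggml.h` API; the tensor sizes, the scratch-buffer size, and the helper name are illustrative only, not taken from the codebase.

```c
#include "ggml.h"

// Illustrative helper (not part of the codebase): shows how the result shape
// of ggml_mul_mat follows from the row-major convention above (ne[0] = columns).
static void matmul_shape_example(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,  // arbitrary scratch size for this sketch
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3); // 4 columns, 3 rows
    struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2); // 4 columns, 2 rows

    // ggml_mul_mat requires A->ne[0] == B->ne[0]; the result C has
    // ne[0] == 3 and ne[1] == 2, i.e. C^T = A B^T  <=>  C = B A^T
    struct ggml_tensor * C = ggml_mul_mat(ctx, A, B);
    (void) C;

    ggml_free(ctx);
}
```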
# Naming guidelines
- Use `snake_case` for function, variable and type names
- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
```cpp
// not OK
int small_number;
int big_number;
// OK
int number_small;
int number_big;
```
- Enum values are always in upper case and prefixed with the enum name
```cpp
enum llama_vocab_type {
LLAMA_VOCAB_TYPE_NONE = 0,
LLAMA_VOCAB_TYPE_SPM = 1,
LLAMA_VOCAB_TYPE_BPE = 2,
LLAMA_VOCAB_TYPE_WPM = 3,
LLAMA_VOCAB_TYPE_UGM = 4,
LLAMA_VOCAB_TYPE_RWKV = 5,
};
```
- The general naming pattern is `<class>_<method>`, with `<method>` being `<action>_<noun>`
```cpp
llama_model_init(); // class: "llama_model", method: "init"
llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove"
llama_sampler_get_seed(); // class: "llama_sampler", method: "get_seed"
llama_set_embeddings(); // class: "llama_context", method: "set_embeddings"
llama_n_threads(); // class: "llama_context", method: "n_threads"
llama_adapter_lora_free(); // class: "llama_adapter_lora", method: "free"
```
- The `get` `<action>` can be omitted
- The `<noun>` can be omitted if not necessary
- The `_context` suffix of the `<class>` is optional. Use it to disambiguate symbols when needed
- Use `init`/`free` for constructor/destructor `<action>` (see the sketch after this list)
- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else
```cpp
typedef struct llama_context * llama_context_t;
enum llama_pooling_type llama_pooling_type(const llama_context_t ctx);
```
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_
- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension
- Python filenames are all lowercase with underscores
- _(TODO: abbreviations usage)_
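A hypothetical sketch tying the naming rules above together; the `foo_*` names are placeholders invented for illustration and are not part of the real `llama.h` API.

```c
#include <stdint.h>
#include <stdlib.h>

// In a real header only the typedef below would be exposed; the struct
// definition is included here so the sketch is self-contained.
struct foo_context { int32_t n_threads; };
typedef struct foo_context * foo_context_t;      // opaque handle, hence the _t suffix

foo_context_t foo_context_init(void) {           // "init" for the constructor
    foo_context_t ctx = calloc(1, sizeof(*ctx));
    if (ctx != NULL) {
        ctx->n_threads = 4;
    }
    return ctx;
}

void foo_context_free(foo_context_t ctx) {       // "free" for the destructor
    free(ctx);
}

int32_t foo_n_threads(foo_context_t ctx) {       // "get" omitted, "_context" suffix dropped
    return ctx->n_threads;
}
```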
# Preprocessor directives
- _(TODO: add guidelines with examples and apply them to the codebase)_
```cpp
#ifdef FOO
#endif // FOO
```
# Documentation
- Documentation is a community effort
- When you need to look into the source code to figure out how to use an API, consider adding a short summary to the header file for future reference
- When you notice incorrect or outdated documentation, please update it
# Resources # Resources
The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects: The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:

View file

@ -22,7 +22,6 @@ BUILD_TARGETS = \
llama-infill \ llama-infill \
llama-llava-cli \ llama-llava-cli \
llama-minicpmv-cli\ llama-minicpmv-cli\
llama-qwen2vl-cli\
llama-lookahead \ llama-lookahead \
llama-lookup \ llama-lookup \
llama-lookup-create \ llama-lookup-create \
@ -52,7 +51,6 @@ TEST_TARGETS = \
tests/test-arg-parser \ tests/test-arg-parser \
tests/test-autorelease \ tests/test-autorelease \
tests/test-backend-ops \ tests/test-backend-ops \
tests/test-chat \
tests/test-chat-template \ tests/test-chat-template \
tests/test-double-float \ tests/test-double-float \
tests/test-grammar-integration \ tests/test-grammar-integration \
@ -447,10 +445,6 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
MK_CFLAGS += -march=native -mtune=native MK_CFLAGS += -march=native -mtune=native
HOST_CXXFLAGS += -march=native -mtune=native HOST_CXXFLAGS += -march=native -mtune=native
# Usage AMX build test
#MK_CFLAGS += -march=graniterapids -mtune=graniterapids
#HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids
# Usage AVX-only # Usage AVX-only
#MK_CFLAGS += -mfma -mf16c -mavx #MK_CFLAGS += -mfma -mf16c -mavx
#MK_CXXFLAGS += -mfma -mf16c -mavx #MK_CXXFLAGS += -mfma -mf16c -mavx
@ -596,7 +590,7 @@ ifdef GGML_RPC
OBJ_GGML_EXT += ggml/src/ggml-rpc.o OBJ_GGML_EXT += ggml/src/ggml-rpc.o
endif # GGML_RPC endif # GGML_RPC
OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-mma*.cu)) OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu)) OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu))
ifdef GGML_CUDA_FA_ALL_QUANTS ifdef GGML_CUDA_FA_ALL_QUANTS
@ -954,6 +948,7 @@ DIR_COMMON = common
OBJ_GGML = \ OBJ_GGML = \
$(DIR_GGML)/src/ggml.o \ $(DIR_GGML)/src/ggml.o \
$(DIR_GGML)/src/ggml-aarch64.o \
$(DIR_GGML)/src/ggml-alloc.o \ $(DIR_GGML)/src/ggml-alloc.o \
$(DIR_GGML)/src/ggml-backend.o \ $(DIR_GGML)/src/ggml-backend.o \
$(DIR_GGML)/src/ggml-backend-reg.o \ $(DIR_GGML)/src/ggml-backend-reg.o \
@ -961,11 +956,9 @@ OBJ_GGML = \
$(DIR_GGML)/src/ggml-quants.o \ $(DIR_GGML)/src/ggml-quants.o \
$(DIR_GGML)/src/ggml-threading.o \ $(DIR_GGML)/src/ggml-threading.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
$(OBJ_GGML_EXT) $(OBJ_GGML_EXT)
OBJ_LLAMA = \ OBJ_LLAMA = \
@ -984,7 +977,6 @@ OBJ_COMMON = \
$(DIR_COMMON)/ngram-cache.o \ $(DIR_COMMON)/ngram-cache.o \
$(DIR_COMMON)/sampling.o \ $(DIR_COMMON)/sampling.o \
$(DIR_COMMON)/speculative.o \ $(DIR_COMMON)/speculative.o \
$(DIR_COMMON)/chat.o \
$(DIR_COMMON)/build-info.o \ $(DIR_COMMON)/build-info.o \
$(DIR_COMMON)/json-schema-to-grammar.o $(DIR_COMMON)/json-schema-to-grammar.o
@ -1106,10 +1098,17 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
# Default target # Default target
all: $(BUILD_TARGETS) all: $(BUILD_TARGETS)
# force c++ build for source file that have same name as c file
# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files # Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp # g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
$(CXX) $(CXXFLAGS) -MMD -c $< -o $@ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
ggml/src/ggml-cpu/ggml-cpu.cpp \
ggml/include/ggml-backend.h \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
ggml/src/ggml-backend-impl.h \
ggml/include/ggml-cpu.h \
ggml/src/ggml-impl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
# Rules for building object files # Rules for building object files
$(DIR_GGML)/%.o: $(DIR_GGML)/%.c $(DIR_GGML)/%.o: $(DIR_GGML)/%.c
@ -1363,11 +1362,7 @@ llama-server: \
examples/server/httplib.h \ examples/server/httplib.h \
examples/server/index.html.hpp \ examples/server/index.html.hpp \
examples/server/loading.html.hpp \ examples/server/loading.html.hpp \
common/chat.cpp \
common/chat.hpp \
common/chat-template.hpp \
common/json.hpp \ common/json.hpp \
common/minja.hpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
@ -1411,14 +1406,6 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift swift: examples/batched.swift
(cd examples/batched.swift; make build) (cd examples/batched.swift; make build)
@ -1475,11 +1462,6 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-chat: tests/test-chat.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-opt: tests/test-opt.cpp \ tests/test-opt: tests/test-opt.cpp \
$(OBJ_GGML) $(OBJ_GGML)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)

View file

@ -2,6 +2,59 @@
import PackageDescription import PackageDescription
var sources = [
"src/llama.cpp",
"src/llama-vocab.cpp",
"src/llama-grammar.cpp",
"src/llama-sampling.cpp",
"src/unicode.cpp",
"src/unicode-data.cpp",
"ggml/src/ggml.c",
"ggml/src/ggml-aarch64.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.cpp",
"ggml/src/ggml-backend-reg.cpp",
"ggml/src/ggml-cpu/ggml-cpu.c",
"ggml/src/ggml-cpu/ggml-cpu.cpp",
"ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
"ggml/src/ggml-cpu/ggml-cpu-quants.c",
"ggml/src/ggml-threading.cpp",
"ggml/src/ggml-quants.c",
]
var resources: [Resource] = []
var linkerSettings: [LinkerSetting] = []
var cSettings: [CSetting] = [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.unsafeFlags(["-fno-objc-arc"]),
.headerSearchPath("ggml/src"),
.headerSearchPath("ggml/src/ggml-cpu"),
// NOTE: NEW_LAPACK will require iOS version 16.4+
// We should consider adding this in the future when we drop support for iOS 14
// (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
.define("GGML_USE_CPU"),
]
#if canImport(Darwin)
sources.append("ggml/src/ggml-common.h")
sources.append("ggml/src/ggml-metal/ggml-metal.m")
resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal"))
linkerSettings.append(.linkedFramework("Accelerate"))
cSettings.append(
contentsOf: [
.define("GGML_USE_ACCELERATE"),
.define("GGML_USE_METAL"),
]
)
#endif
#if os(Linux)
cSettings.append(.define("_GNU_SOURCE"))
#endif
let package = Package( let package = Package(
name: "llama", name: "llama",
platforms: [ platforms: [
@ -14,6 +67,26 @@ let package = Package(
.library(name: "llama", targets: ["llama"]), .library(name: "llama", targets: ["llama"]),
], ],
targets: [ targets: [
.systemLibrary(name: "llama", pkgConfig: "llama"), .target(
] name: "llama",
path: ".",
exclude: [
"build",
"cmake",
"examples",
"scripts",
"models",
"tests",
"CMakeLists.txt",
"Makefile",
"ggml/src/ggml-metal-embed.metal"
],
sources: sources,
resources: resources,
publicHeadersPath: "spm-headers",
cSettings: cSettings,
linkerSettings: linkerSettings
)
],
cxxLanguageStandard: .cxx17
) )

View file

@ -16,11 +16,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Hot topics ## Hot topics
- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggerganov/llama.cpp/pull/11427 - **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
- Universal tool call support in `llama-server`: https://github.com/ggerganov/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) - Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
@ -73,7 +69,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557) - [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi) - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
- [x] [PhiMoE](https://github.com/ggerganov/llama.cpp/pull/11003)
- [x] [GPT-2](https://huggingface.co/gpt2) - [x] [GPT-2](https://huggingface.co/gpt2)
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118) - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
- [x] [InternLM2](https://huggingface.co/models?search=internlm2) - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
@ -96,15 +91,13 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM) - [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
- [x] [Flan T5](https://huggingface.co/models?search=flan-t5) - [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca) - [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat) - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a) - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
#### Multimodal #### Multimodal
@ -117,8 +110,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM) - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny) - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
</details> </details>
@ -136,7 +127,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs) - Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
- Rust (automated build from crates.io): [ShelbyJenkins/llm_client](https://github.com/ShelbyJenkins/llm_client)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html) - C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
@ -189,7 +179,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [ramalama](https://github.com/containers/ramalama) (MIT) - [ramalama](https://github.com/containers/ramalama) (MIT)
- [semperai/amica](https://github.com/semperai/amica) (MIT) - [semperai/amica](https://github.com/semperai/amica) (MIT)
- [withcatai/catai](https://github.com/withcatai/catai) (MIT) - [withcatai/catai](https://github.com/withcatai/catai) (MIT)
- [Autopen](https://github.com/blackhole89/autopen) (GPL)
</details> </details>
@ -210,8 +199,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
</details> </details>
@ -232,7 +219,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU | | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU | | [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
| [CUDA](docs/build.md#cuda) | Nvidia GPU | | [CUDA](docs/build.md#cuda) | Nvidia GPU |
| [HIP](docs/build.md#hip) | AMD GPU | | [hipBLAS](docs/build.md#hipblas) | AMD GPU |
| [Vulkan](docs/build.md#vulkan) | GPU | | [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU | | [CANN](docs/build.md#cann) | Ascend NPU |
@ -253,8 +240,6 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
- [Trending](https://huggingface.co/models?library=gguf&sort=trending) - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf) - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf <user>/<model>[:quant]`
After downloading a model, use the CLI tools to run it locally - see below. After downloading a model, use the CLI tools to run it locally - see below.
`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo. `llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
@ -273,12 +258,21 @@ To learn more about model quantization, [read this documentation](examples/quant
#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality. #### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
- <details open> - <details open>
<summary>Run in conversation mode</summary> <summary>Run simple text completion</summary>
Models with a built-in chat template will automatically activate conversation mode. If this doesn't occur, you can manually enable it by adding `-cnv` and specifying a suitable chat template with `--chat-template NAME`
```bash ```bash
llama-cli -m model.gguf llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga - it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
```
</details>
- <details>
<summary>Run in conversation mode</summary>
```bash
llama-cli -m model.gguf -p "You are a helpful assistant" -cnv
# > hi, who are you? # > hi, who are you?
# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today? # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
@ -290,28 +284,17 @@ To learn more about model quantization, [read this documentation](examples/quant
</details> </details>
- <details> - <details>
<summary>Run in conversation mode with custom chat template</summary> <summary>Run with custom chat template</summary>
```bash ```bash
# use the "chatml" template (use -h to see the list of supported templates) # use the "chatml" template
llama-cli -m model.gguf -cnv --chat-template chatml llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
# use a custom template # use a custom template
llama-cli -m model.gguf -cnv --in-prefix 'User: ' --reverse-prompt 'User:' llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
``` ```
</details> [Supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- <details>
<summary>Run simple text completion</summary>
To disable conversation mode explicitly, use `-no-cnv`
```bash
llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga - it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
```
</details> </details>
@ -426,10 +409,10 @@ To learn more about model quantization, [read this documentation](examples/quant
</details> </details>
[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md) [^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity) [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
## [`llama-bench`](examples/llama-bench) ## [`llama-bench`](example/bench)
#### Benchmark the performance of the inference for various parameters. #### Benchmark the performance of the inference for various parameters.
@ -450,20 +433,6 @@ To learn more about model quantization, [read this documentation](examples/quant
</details> </details>
## [`llama-run`](examples/run)
#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
- <details>
<summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
```bash
llama-run granite-code
```
</details>
[^3]: [RamaLama](https://github.com/containers/ramalama)
## [`llama-simple`](examples/simple) ## [`llama-simple`](examples/simple)

View file

@ -1,4 +0,0 @@
#pragma once
#include <llama.h>

View file

@ -1,5 +0,0 @@
module llama [system] {
header "llama.h"
link "llama"
export *
}

View file

@ -326,17 +326,17 @@ function gg_run_open_llama_7b_v2 {
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@ -460,17 +460,17 @@ function gg_run_pythia_1_4b {
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-cli --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-cli --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/llama-cli --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/llama-cli --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/llama-cli --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/llama-cli --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/llama-cli --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/llama-cli --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/llama-cli --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama-cli --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-cli --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -591,17 +591,17 @@ function gg_run_pythia_2_8b {
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

View file

@@ -44,7 +44,7 @@ if(MSVC)
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
else()
execute_process(
COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER} COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
)

View file

@@ -3,13 +3,159 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(GGML_STATIC @GGML_STATIC@)
set(GGML_NATIVE @GGML_NATIVE@)
set(GGML_LTO @GGML_LTO@)
set(GGML_CCACHE @GGML_CCACHE@)
set(GGML_AVX @GGML_AVX@)
set(GGML_AVX2 @GGML_AVX2@)
set(GGML_AVX512 @GGML_AVX512@)
set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@)
set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@)
set(GGML_AVX512_BF16 @GGML_AVX512_BF16@)
set(GGML_AMX_TILE @GGML_AMX_TILE@)
set(GGML_AMX_INT8 @GGML_AMX_INT8@)
set(GGML_AMX_BF16 @GGML_AMX_BF16@)
set(GGML_FMA @GGML_FMA@)
set(GGML_LASX @GGML_LASX@)
set(GGML_LSX @GGML_LSX@)
set(GGML_RVV @GGML_RVV@)
set(GGML_SVE @GGML_SVE@)
set(GGML_ACCELERATE @GGML_ACCELERATE@)
set(GGML_OPENMP @GGML_OPENMP@)
set(GGML_CPU_HBM @GGML_CPU_HBM@)
set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@)
set(GGML_CUDA_FORCE_MMQ @GGML_CUDA_FORCE_MMQ@)
set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@)
set(GGML_CUDA_F16 @GGML_CUDA_F16@)
set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@)
set(GGML_CUDA_NO_PEER_COPY @GGML_CUDA_NO_PEER_COPY@)
set(GGML_CUDA_NO_VMM @GGML_CUDA_NO_VMM@)
set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@)
set(GGML_CUDA_GRAPHS @GGML_CUDA_GRAPHS@)
set(GGML_HIP_UMA @GGML_HIP_UMA@)
set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@)
set(GGML_VULKAN_PERF @GGML_VULKAN_PERF@)
set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@)
set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@)
set(GGML_METAL_NDEBUG @GGML_METAL_NDEBUG@)
set(GGML_METAL_SHADER_DEBUG @GGML_METAL_SHADER_DEBUG@)
set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@)
set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@)
set(GGML_METAL_STD @GGML_METAL_STD@)
set(GGML_SYCL_F16 @GGML_SYCL_F16@)
set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@)
set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@)
@PACKAGE_INIT@
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake) find_package(Threads REQUIRED)
set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
set(_llama_link_deps "")
set(_llama_link_opts "")
foreach(_ggml_lib ggml ggml-base)
string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
find_library(${_ggml_lib_var} ${_ggml_lib}
REQUIRED
HINTS ${LLAMA_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH
)
list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
message(STATUS "Found ${${_ggml_lib_var}}")
endforeach()
foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan)
string(TOUPPER "GGML_${backend}" backend_id)
set(_ggml_lib "ggml-${backend}")
string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
find_library(${_ggml_lib_var} ${_ggml_lib}
HINTS ${LLAMA_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH
)
if(${_ggml_lib_var})
list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
set(${backend_id} ON)
message(STATUS "Found backend ${${_ggml_lib_var}}")
else()
set(${backend_id} OFF)
endif()
endforeach()
if (NOT LLAMA_SHARED_LIB)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK})
endif()
if (GGML_OPENMP)
find_package(OpenMP REQUIRED)
list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
endif()
if (GGML_CPU_HBM)
find_library(memkind memkind REQUIRED)
list(APPEND _llama_link_deps memkind)
endif()
if (GGML_BLAS)
find_package(BLAS REQUIRED)
list(APPEND _llama_link_deps ${BLAS_LIBRARIES})
list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS})
endif()
if (GGML_CUDA)
find_package(CUDAToolkit REQUIRED)
endif()
if (GGML_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
endif()
if (GGML_VULKAN)
find_package(Vulkan REQUIRED)
list(APPEND _llama_link_deps Vulkan::Vulkan)
endif()
if (GGML_HIP)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas)
endif()
if (GGML_SYCL)
find_package(DNNL)
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
list(APPEND _llama_link_deps DNNL::dnnl)
endif()
if (WIN32)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
endif()
endif()
endif()
find_library(llama_LIBRARY llama
REQUIRED
@@ -21,10 +167,12 @@ add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;" INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
INTERFACE_LINK_OPTIONS "${_llama_link_opts}"
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}"
INTERFACE_COMPILE_FEATURES c_std_90 INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON) POSITION_INDEPENDENT_CODE ON )
check_required_components(Llama)

View file

@@ -1,10 +1,10 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=@CMAKE_INSTALL_PREFIX@ exec_prefix=${prefix}
libdir=@CMAKE_INSTALL_FULL_LIBDIR@ libdir=${exec_prefix}/lib
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ includedir=${prefix}/include
Name: llama
Description: Port of Facebook's LLaMA model in C/C++
Version: @LLAMA_INSTALL_VERSION@ Version: @PROJECT_VERSION@
Libs: -L${libdir} -lggml -lggml-base -lllama Libs: -L${libdir} -lllama
Cflags: -I${includedir}

View file

@@ -1,11 +0,0 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR x86_64 )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( arch_c_flags "-march=native" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )

View file

@@ -56,19 +56,14 @@ add_library(${TARGET} STATIC
arg.cpp
arg.h
base64.hpp
chat.cpp
chat.hpp
chat-template.hpp
common.cpp
common.h
console.cpp
console.h
json-schema-to-grammar.cpp
json.hpp
llguidance.cpp
log.cpp
log.h
minja.hpp
ngram-cache.cpp
ngram-cache.h
sampling.cpp
@@ -86,39 +81,12 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url
if (LLAMA_CURL)
find_package(CURL REQUIRED)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL) add_definitions(-DLLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
endif ()
if (LLAMA_LLGUIDANCE)
include(ExternalProject)
set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
# v0.6.12:
GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
PREFIX ${CMAKE_BINARY_DIR}/llguidance
SOURCE_DIR ${LLGUIDANCE_SRC}
BUILD_IN_SOURCE TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND cargo build --release
INSTALL_COMMAND ""
BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/libllguidance.a ${LLGUIDANCE_PATH}/llguidance.h
UPDATE_COMMAND ""
)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)
add_library(llguidance STATIC IMPORTED)
set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/libllguidance.a)
add_dependencies(llguidance llguidance_ext)
target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance)
endif ()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_17)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)

View file

@@ -22,11 +22,6 @@ common_arg & common_arg::set_examples(std::initializer_list<enum llama_example>
return *this; return *this;
} }
common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
this->excludes = std::move(excludes);
return *this;
}
common_arg & common_arg::set_env(const char * env) { common_arg & common_arg::set_env(const char * env) {
help = help + "\n(env: " + env + ")"; help = help + "\n(env: " + env + ")";
this->env = env; this->env = env;
@@ -42,10 +37,6 @@ bool common_arg::in_example(enum llama_example ex) {
return examples.find(ex) != examples.end(); return examples.find(ex) != examples.end();
} }
bool common_arg::is_exclude(enum llama_example ex) {
return excludes.find(ex) != excludes.end();
}
bool common_arg::get_value_from_env(std::string & output) { bool common_arg::get_value_from_env(std::string & output) {
if (env == nullptr) return false; if (env == nullptr) return false;
char * value = std::getenv(env); char * value = std::getenv(env);
@@ -128,75 +119,32 @@ std::string common_arg::to_string() {
// utils // utils
// //
static void common_params_handle_model_default( static void common_params_handle_model_default(common_params & params) {
std::string & model, if (!params.hf_repo.empty()) {
const std::string & model_url,
std::string & hf_repo,
std::string & hf_file,
const std::string & hf_token,
const std::string & model_default) {
if (!hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model // short-hand to avoid specifying --hf-file -> default it to --model
if (hf_file.empty()) { if (params.hf_file.empty()) {
if (model.empty()) { if (params.model.empty()) {
auto auto_detected = common_get_hf_file(hf_repo, hf_token); throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
if (auto_detected.first.empty() || auto_detected.second.empty()) {
exit(1); // built without CURL, error message already printed
}
hf_repo = auto_detected.first;
hf_file = auto_detected.second;
} else {
hf_file = model;
} }
} params.hf_file = params.model;
// make sure model path is present (for caching purposes) } else if (params.model.empty()) {
if (model.empty()) {
// this is to avoid different repo having same file name, or same file name in different subdirs // this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = hf_repo + "_" + hf_file; std::string filename = params.hf_repo + "_" + params.hf_file;
// to make sure we don't have any slashes in the filename // to make sure we don't have any slashes in the filename
string_replace_all(filename, "/", "_"); string_replace_all(filename, "/", "_");
model = fs_get_cache_file(filename); params.model = fs_get_cache_file(filename);
} }
} else if (!model_url.empty()) { } else if (!params.model_url.empty()) {
if (model.empty()) { if (params.model.empty()) {
auto f = string_split<std::string>(model_url, '#').front(); auto f = string_split<std::string>(params.model_url, '#').front();
f = string_split<std::string>(f, '?').front(); f = string_split<std::string>(f, '?').front();
model = fs_get_cache_file(string_split<std::string>(f, '/').back()); params.model = fs_get_cache_file(string_split<std::string>(f, '/').back());
} }
} else if (model.empty()) { } else if (params.model.empty()) {
model = model_default; params.model = DEFAULT_MODEL_PATH;
} }
} }
const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_BF16,
GGML_TYPE_Q8_0,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_IQ4_NL,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
};
static ggml_type kv_cache_type_from_str(const std::string & s) {
for (const auto & type : kv_cache_types) {
if (ggml_type_name(type) == s) {
return type;
}
}
throw std::runtime_error("Unsupported cache type: " + s);
}
static std::string get_all_kv_cache_types() {
std::ostringstream msg;
for (const auto & type : kv_cache_types) {
msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
}
return msg.str();
}
// //
// CLI argument parsing functions // CLI argument parsing functions
// //
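
One side of the hunk above replaces the free-form -ctk/-ctv strings with a table-driven lookup from type name to ggml_type. A minimal, self-contained sketch of that lookup, assuming only the public ggml API (ggml_type, ggml_type_name); the function name cache_type_from_string is illustrative:

#include "ggml.h"
#include <stdexcept>
#include <string>
#include <vector>

// map a cache-type string such as "q8_0" onto the matching ggml_type by
// comparing it against ggml_type_name() over an explicit allow-list
static ggml_type cache_type_from_string(const std::string & s) {
    static const std::vector<ggml_type> allowed = {
        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
        GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
        GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
    };
    for (ggml_type t : allowed) {
        if (s == ggml_type_name(t)) {
            return t; // e.g. "-ctk q8_0" resolves to GGML_TYPE_Q8_0
        }
    }
    throw std::runtime_error("unsupported KV cache type: " + s);
}
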
@@ -299,10 +247,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
} }
// TODO: refactor model params in a common struct common_params_handle_model_default(params);
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token, DEFAULT_MODEL_PATH);
common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, "");
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token, "");
if (params.escape) { if (params.escape) {
string_process_escapes(params.prompt); string_process_escapes(params.prompt);
@@ -325,14 +270,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both"); throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
} }
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
throw std::runtime_error(string_format(
"error: the supplied chat template is not supported: %s%s\n",
params.chat_template.c_str(),
params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates"
));
}
return true; return true;
} }
@@ -386,30 +323,6 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
return devices; return devices;
} }
static void add_rpc_devices(std::string servers) {
auto rpc_servers = string_split<std::string>(servers, ',');
if (rpc_servers.empty()) {
throw std::invalid_argument("no RPC servers specified");
}
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
if (!rpc_reg) {
throw std::invalid_argument("failed to find RPC backend");
}
typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
if (!ggml_backend_rpc_add_device_fn) {
throw std::invalid_argument("failed to find RPC device add function");
}
for (const auto & server : rpc_servers) {
ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
if (dev) {
ggml_backend_device_register(dev);
} else {
throw std::invalid_argument("failed to register RPC device");
}
}
}
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
auto ctx_arg = common_params_parser_init(params, ex, print_usage); auto ctx_arg = common_params_parser_init(params, ex, print_usage);
const common_params params_org = ctx_arg.params; // the example can modify the default params const common_params params_org = ctx_arg.params; // the example can modify the default params
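
The add_rpc_devices() helper in the hunk above resolves the RPC backend's device factory through the ggml backend registry instead of linking it directly. A condensed sketch of that pattern, using only registry calls that appear above; register_rpc_endpoint is an illustrative name:

#include "ggml-backend.h"
#include <stdexcept>
#include <string>

// register one RPC endpoint (e.g. "192.168.1.2:50052") as a ggml device
static void register_rpc_endpoint(const std::string & endpoint) {
    ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC");
    if (!reg) {
        throw std::runtime_error("RPC backend not available");
    }
    // the device factory is exported as a named proc address, so common code
    // needs no compile-time dependency on the RPC backend
    typedef ggml_backend_dev_t (*rpc_add_device_t)(const char * endpoint);
    auto add_device = (rpc_add_device_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_rpc_add_device");
    if (!add_device) {
        throw std::runtime_error("ggml_backend_rpc_add_device not found");
    }
    if (ggml_backend_dev_t dev = add_device(endpoint.c_str())) {
        ggml_backend_device_register(dev);
    }
}
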
@@ -472,7 +385,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
* - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
*/ */
auto add_opt = [&](common_arg arg) { auto add_opt = [&](common_arg arg) {
if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) { if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
ctx_arg.options.push_back(std::move(arg)); ctx_arg.options.push_back(std::move(arg));
} }
}; };
@@ -678,7 +591,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) { [](common_params & params) {
params.ctx_shift = false; params.ctx_shift = false;
} }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
add_opt(common_arg( add_opt(common_arg(
{"--chunks"}, "N", {"--chunks"}, "N",
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -701,7 +614,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.prompt = value; params.prompt = value;
} }
).set_excludes({LLAMA_EXAMPLE_SERVER})); ));
add_opt(common_arg( add_opt(common_arg(
{"--no-perf"}, {"--no-perf"},
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -725,7 +638,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.prompt.pop_back(); params.prompt.pop_back();
} }
} }
).set_excludes({LLAMA_EXAMPLE_SERVER})); ));
add_opt(common_arg( add_opt(common_arg(
{"--in-file"}, "FNAME", {"--in-file"}, "FNAME",
"an input file (repeat to specify multiple files)", "an input file (repeat to specify multiple files)",
@@ -752,7 +665,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.prompt = ss.str(); params.prompt = ss.str();
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
} }
).set_excludes({LLAMA_EXAMPLE_SERVER})); ));
add_opt(common_arg( add_opt(common_arg(
{"-e", "--escape"}, {"-e", "--escape"},
string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
@@ -811,19 +724,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"-cnv", "--conversation"}, {"-cnv", "--conversation"},
"run in conversation mode:\n" string_format(
"- does not print special tokens and suffix/prefix\n" "run in conversation mode:\n"
"- interactive mode is also enabled\n" "- does not print special tokens and suffix/prefix\n"
"(default: auto enabled if chat template is available)", "- interactive mode is also enabled\n"
"(default: %s)",
params.conversation ? "true" : "false"
),
[](common_params & params) { [](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; params.conversation = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"-no-cnv", "--no-conversation"},
"force disable conversation mode (default: false)",
[](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
} }
).set_examples({LLAMA_EXAMPLE_MAIN})); ).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg( add_opt(common_arg(
@@ -877,7 +786,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) { [](common_params & params) {
params.warmup = false; params.warmup = false;
} }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING})); ).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg( add_opt(common_arg(
{"--spm-infill"}, {"--spm-infill"},
string_format( string_format(
@@ -904,7 +813,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--sampling-seq", "--sampler-seq"}, "SEQUENCE", {"--sampling-seq"}, "SEQUENCE",
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sampling.samplers = common_sampler_types_from_chars(value); params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -917,6 +826,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.ignore_eos = true; params.sampling.ignore_eos = true;
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg(
{"--penalize-nl"},
string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
[](common_params & params) {
params.sampling.penalize_nl = true;
}
).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--temp"}, "N", {"--temp"}, "N",
string_format("temperature (default: %.1f)", (double)params.sampling.temp), string_format("temperature (default: %.1f)", (double)params.sampling.temp),
@@ -971,9 +887,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--repeat-last-n"}, "N", {"--repeat-last-n"}, "N",
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n), string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
[](common_params & params, int value) { [](common_params & params, int value) {
if (value < -1) {
throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
}
params.sampling.penalty_last_n = value; params.sampling.penalty_last_n = value;
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
} }
@@ -1028,9 +941,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--dry-penalty-last-n"}, "N", {"--dry-penalty-last-n"}, "N",
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n), string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
[](common_params & params, int value) { [](common_params & params, int value) {
if (value < -1) {
throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
}
params.sampling.dry_penalty_last_n = value; params.sampling.dry_penalty_last_n = value;
} }
).set_sparam()); ).set_sparam());
@@ -1264,28 +1174,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
add_opt(common_arg( add_opt(common_arg(
{"-ctk", "--cache-type-k"}, "TYPE", {"-ctk", "--cache-type-k"}, "TYPE",
string_format( string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
"KV cache data type for K\n"
"allowed values: %s\n"
"(default: %s)",
get_all_kv_cache_types().c_str(),
ggml_type_name(params.cache_type_k)
),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.cache_type_k = kv_cache_type_from_str(value); // TODO: get the type right here
params.cache_type_k = value;
} }
).set_env("LLAMA_ARG_CACHE_TYPE_K")); ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
add_opt(common_arg( add_opt(common_arg(
{"-ctv", "--cache-type-v"}, "TYPE", {"-ctv", "--cache-type-v"}, "TYPE",
string_format( string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
"KV cache data type for V\n"
"allowed values: %s\n"
"(default: %s)",
get_all_kv_cache_types().c_str(),
ggml_type_name(params.cache_type_v)
),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.cache_type_v = kv_cache_type_from_str(value); // TODO: get the type right here
params.cache_type_v = value;
} }
).set_env("LLAMA_ARG_CACHE_TYPE_V")); ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
add_opt(common_arg( add_opt(common_arg(
@@ -1419,8 +1319,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--rpc"}, "SERVERS", {"--rpc"}, "SERVERS",
"comma separated list of RPC servers", "comma separated list of RPC servers",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
add_rpc_devices(value); params.rpc_servers = value;
GGML_UNUSED(params);
} }
).set_env("LLAMA_ARG_RPC")); ).set_env("LLAMA_ARG_RPC"));
} }
@@ -1465,28 +1364,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--list-devices"}, {"--list-devices"},
"print list of available devices and exit", "print list of available devices and exit",
[](common_params &) { [](common_params &) {
std::vector<ggml_backend_dev_t> rpc_devices; printf("Available devices:\n");
std::vector<ggml_backend_dev_t> all_devices;
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
auto * dev = ggml_backend_dev_get(i); auto * dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); size_t free, total;
if (ggml_backend_reg_name(reg) == std::string("RPC")) { ggml_backend_dev_memory(dev, &free, &total);
rpc_devices.push_back(dev); printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
} else {
all_devices.push_back(dev);
}
} }
} }
// insert RPC devices in front
all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
printf("Available devices:\n");
for (size_t i = 0; i < all_devices.size(); ++i) {
auto * dev = all_devices[i];
size_t free, total;
ggml_backend_dev_memory(dev, &free, &total);
printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
}
exit(0); exit(0);
} }
)); ));
@@ -1582,7 +1468,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--lora"}, "FNAME", {"--lora"}, "FNAME",
"path to LoRA adapter (can be repeated to use multiple adapters)", "path to LoRA adapter (can be repeated to use multiple adapters)",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.lora_adapters.push_back({ std::string(value), 1.0, nullptr }); params.lora_adapters.push_back({ std::string(value), 1.0 });
} }
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -1590,7 +1476,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--lora-scaled"}, "FNAME", "SCALE", {"--lora-scaled"}, "FNAME", "SCALE",
"path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
[](common_params & params, const std::string & fname, const std::string & scale) { [](common_params & params, const std::string & fname, const std::string & scale) {
params.lora_adapters.push_back({ fname, std::stof(scale), nullptr }); params.lora_adapters.push_back({ fname, std::stof(scale) });
} }
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -1644,42 +1530,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_env("LLAMA_ARG_MODEL_URL")); ).set_env("LLAMA_ARG_MODEL_URL"));
add_opt(common_arg( add_opt(common_arg(
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]", {"-hfr", "--hf-repo"}, "REPO",
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" "Hugging Face model repository (default: unused)",
"example: unsloth/phi-4-GGUF:q4_k_m\n"
"(default: unused)",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.hf_repo = value; params.hf_repo = value;
} }
).set_env("LLAMA_ARG_HF_REPO")); ).set_env("LLAMA_ARG_HF_REPO"));
add_opt(common_arg(
{"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
"Same as --hf-repo, but for the draft model (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.hf_repo = value;
}
).set_env("LLAMA_ARG_HFD_REPO"));
add_opt(common_arg( add_opt(common_arg(
{"-hff", "--hf-file"}, "FILE", {"-hff", "--hf-file"}, "FILE",
"Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)", "Hugging Face model file (default: unused)",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.hf_file = value; params.hf_file = value;
} }
).set_env("LLAMA_ARG_HF_FILE")); ).set_env("LLAMA_ARG_HF_FILE"));
add_opt(common_arg(
{"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
"Hugging Face model repository for the vocoder model (default: unused)",
[](common_params & params, const std::string & value) {
params.vocoder.hf_repo = value;
}
).set_env("LLAMA_ARG_HF_REPO_V"));
add_opt(common_arg(
{"-hffv", "--hf-file-v"}, "FILE",
"Hugging Face model file for the vocoder model (default: unused)",
[](common_params & params, const std::string & value) {
params.vocoder.hf_file = value;
}
).set_env("LLAMA_ARG_HF_FILE_V"));
add_opt(common_arg( add_opt(common_arg(
{"-hft", "--hf-token"}, "TOKEN", {"-hft", "--hf-token"}, "TOKEN",
"Hugging Face access token (default: value from HF_TOKEN environment variable)", "Hugging Face access token (default: value from HF_TOKEN environment variable)",
@@ -1848,13 +1711,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.public_path = value; params.public_path = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
add_opt(common_arg(
{"--no-webui"},
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
[](common_params & params) {
params.webui = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
add_opt(common_arg( add_opt(common_arg(
{"--embedding", "--embeddings"}, {"--embedding", "--embeddings"},
string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -1968,44 +1824,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
} }
).set_examples({LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--jinja"},
"use jinja template for chat (default: disabled)",
[](common_params & params) {
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg( add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE", {"--chat-template"}, "JINJA_TEMPLATE",
string_format( string_format(
"set custom jinja chat template (default: template taken from model's metadata)\n" "set custom jinja chat template (default: template taken from model's metadata)\n"
"if suffix/prefix are specified, template will be disabled\n" "if suffix/prefix are specified, template will be disabled\n"
"only commonly used templates are accepted (unless --jinja is set before this flag):\n"
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str() "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
), ),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
if (!common_chat_verify_template(value)) {
throw std::runtime_error(string_format(
"error: the supplied chat template is not supported: %s\n"
"note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
value.c_str()
));
}
params.chat_template = value; params.chat_template = value;
} }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
add_opt(common_arg(
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
string_format(
"set custom jinja chat template file (default: template taken from model's metadata)\n"
"if suffix/prefix are specified, template will be disabled\n"
"only commonly used templates are accepted (unless --jinja is set before this flag):\n"
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
),
[](common_params & params, const std::string & value) {
std::ifstream file(value);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
}
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(params.chat_template));
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg( add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY", {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2240,35 +2076,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) { [](common_params & params, int value) {
params.speculative.n_max = value; params.speculative.n_max = value;
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX")); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"--draft-min", "--draft-n-min"}, "N", {"--draft-min", "--draft-n-min"}, "N",
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
[](common_params & params, int value) { [](common_params & params, int value) {
params.speculative.n_min = value; params.speculative.n_min = value;
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN")); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"--draft-p-split"}, "P", {"--draft-p-split"}, "P",
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.speculative.p_split = std::stof(value); params.speculative.p_split = std::stof(value);
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT")); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
add_opt(common_arg( add_opt(common_arg(
{"--draft-p-min"}, "P", {"--draft-p-min"}, "P",
string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.speculative.p_min = std::stof(value); params.speculative.p_min = std::stof(value);
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN")); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"-cd", "--ctx-size-draft"}, "N", {"-cd", "--ctx-size-draft"}, "N",
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
[](common_params & params, int value) { [](common_params & params, int value) {
params.speculative.n_ctx = value; params.speculative.n_ctx = value;
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT")); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"-devd", "--device-draft"}, "<dev1,dev2,..>", {"-devd", "--device-draft"}, "<dev1,dev2,..>",
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2288,83 +2124,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
} }
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT")); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"-md", "--model-draft"}, "FNAME", {"-md", "--model-draft"}, "FNAME",
"draft model for speculative decoding (default: unused)", "draft model for speculative decoding (default: unused)",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.speculative.model = value; params.speculative.model = value;
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-mv", "--model-vocoder"}, "FNAME",
"vocoder model for audio generation (default: unused)",
[](common_params & params, const std::string & value) {
params.vocoder.model = value;
}
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--tts-use-guide-tokens"},
"Use guide tokens to improve TTS word recall",
[](common_params & params) {
params.vocoder.use_guide_tokens = true;
}
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
// model-specific
add_opt(common_arg(
{"--tts-oute-default"},
string_format("use default OuteTTS models (note: can download weights from the internet)"),
[](common_params & params) {
params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
params.vocoder.hf_repo = "ggml-org/WavTokenizer";
params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
}
).set_examples({LLAMA_EXAMPLE_TTS}));
add_opt(common_arg(
{"--embd-bge-small-en-default"},
string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
[](common_params & params) {
params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
params.verbose_prompt = true;
params.embedding = true;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--embd-e5-small-en-default"},
string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
[](common_params & params) {
params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
params.hf_file = "e5-small-v2-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
params.verbose_prompt = true;
params.embedding = true;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--embd-gte-small-default"},
string_format("use default gte-small model (note: can download weights from the internet)"),
[](common_params & params) {
params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
params.hf_file = "gte-small-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
params.verbose_prompt = true;
params.embedding = true;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
return ctx_arg; return ctx_arg;
} }

View file

@@ -12,7 +12,6 @@
struct common_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::set<enum llama_example> excludes = {};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value
@@ -54,11 +53,9 @@ struct common_arg {
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
common_arg & set_examples(std::initializer_list<enum llama_example> examples);
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
common_arg & set_env(const char * env);
common_arg & set_sparam();
bool in_example(enum llama_example ex);
bool is_exclude(enum llama_example ex);
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();
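
The examples/excludes pair carried by this header on one side gates which flags each tool exposes. A small standalone sketch of the filtering rule, mirroring the add_opt lambda in common/arg.cpp shown earlier; arg_like and visible_for are illustrative names:

#include <set>

enum llama_example { LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER };

struct arg_like {
    std::set<llama_example> examples = {LLAMA_EXAMPLE_COMMON};
    std::set<llama_example> excludes = {};
    bool in_example(llama_example ex) const { return examples.count(ex) > 0; }
    bool is_exclude(llama_example ex) const { return excludes.count(ex) > 0; }
};

// an option is offered to example ex when it is registered for that example
// (or for COMMON) and not explicitly excluded, e.g. --prompt hidden from llama-server
static bool visible_for(const arg_like & arg, llama_example ex) {
    return (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex);
}
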

View file

@@ -1,529 +0,0 @@
/*
Copyright 2024 Google LLC
Use of this source code is governed by an MIT-style
license that can be found in the LICENSE file or at
https://opensource.org/licenses/MIT.
*/
// SPDX-License-Identifier: MIT
#pragma once
#include "minja.hpp"
#include <json.hpp>
#include <string>
#include <vector>
using json = nlohmann::ordered_json;
namespace minja {
struct chat_template_caps {
bool supports_tools = false;
bool supports_tool_calls = false;
bool supports_tool_responses = false;
bool supports_system_role = false;
bool supports_parallel_tool_calls = false;
bool supports_tool_call_id = false;
// meta-llama/Llama-3.1-8B-Instruct expects arguments to be an object.
// Most other templates (and OpenAI's API) expect the arguments object to be stringified.
bool requires_object_arguments = false;
// CohereForAI/c4ai-command-r-plus simple variant
bool requires_non_null_content = false;
// MiniMaxAI/MiniMax-Text-01 special
bool requires_typed_content = false;
};
struct chat_template_inputs {
nlohmann::ordered_json messages;
nlohmann::ordered_json tools;
bool add_generation_prompt = true;
nlohmann::ordered_json extra_context;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
};
struct chat_template_options {
bool apply_polyfills = true;
bool use_bos_token = true;
bool use_eos_token = true;
bool define_strftime_now = true;
bool polyfill_tools = true;
bool polyfill_tool_call_examples = true;
bool polyfill_tool_calls = true;
bool polyfill_tool_responses = true;
bool polyfill_system_role = true;
bool polyfill_object_arguments = true;
bool polyfill_typed_content = true;
};
class chat_template {
private:
chat_template_caps caps_;
std::string source_;
std::string bos_token_;
std::string eos_token_;
std::shared_ptr<minja::TemplateNode> template_root_;
std::string tool_call_example_;
std::string try_raw_render(
const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools,
bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const
{
try {
chat_template_inputs inputs;
inputs.messages = messages;
inputs.tools = tools;
inputs.add_generation_prompt = add_generation_prompt;
inputs.extra_context = extra_context;
// Use fixed date for tests
inputs.now = std::chrono::system_clock::from_time_t(0);
chat_template_options opts;
opts.apply_polyfills = false;
auto prompt = apply(inputs, opts);
// fprintf(stderr, "try_raw_render: %s\n", prompt.c_str());
return prompt;
} catch (const std::exception & e) {
// fprintf(stderr, "try_raw_render error: %s\n", e.what());
return "";
}
}
public:
chat_template(const std::string & source, const std::string & bos_token, const std::string & eos_token)
: source_(source), bos_token_(bos_token), eos_token_(eos_token)
{
template_root_ = minja::Parser::parse(source_, {
/* .trim_blocks = */ true,
/* .lstrip_blocks = */ true,
/* .keep_trailing_newline = */ false,
});
auto contains = [](const std::string & haystack, const std::string & needle) {
return haystack.find(needle) != std::string::npos;
};
const std::string user_needle = "<User Needle>";
const std::string sys_needle = "<System Needle>";
const json dummy_str_user_msg = {{"role", "user"}, {"content", user_needle}};
const json dummy_typed_user_msg = {{"role", "user"}, {"content", json::array({{{"type", "text"}, {"text", user_needle}}})}};
caps_.requires_typed_content =
!contains(try_raw_render(json::array({dummy_str_user_msg}), {}, false), user_needle)
&& contains(try_raw_render(json::array({dummy_typed_user_msg}), {}, false), user_needle);
const auto dummy_user_msg = caps_.requires_typed_content
? dummy_typed_user_msg
: dummy_str_user_msg;
const json needle_system_msg = {
{"role", "system"},
{"content", caps_.requires_typed_content ? json::array({{{"type", "text"}, {"text", sys_needle}}}) : json(sys_needle)},
};
caps_.supports_system_role = contains(try_raw_render({needle_system_msg, dummy_user_msg,}, {}, false), sys_needle);
auto out = try_raw_render(json::array({
dummy_user_msg
}), json::array({
{
{"name", "some_tool"},
{"type", "function"},
{"function", {
{"name", "some_tool"},
{"description", "Some tool."},
{"parameters", {
{"type", "object"},
{"properties", {
{"arg", {
{"type", "string"},
{"description", "Some argument."},
}},
}},
{"required", json::array({ "arg" })},
}},
}},
},
}), false);
caps_.supports_tools = contains(out, "some_tool");
auto make_tool_calls_msg = [&](const json & tool_calls) {
return json {
{"role", "assistant"},
{"content", nullptr},
{"tool_calls", tool_calls},
};
};
auto make_tool_call = [](const std::string & tool_name, const json & arguments) {
return json {
{"id", "call_1___"},
{"type", "function"},
{"function", {
{"arguments", arguments},
{"name", tool_name},
}},
};
};
const json dummy_args_obj {{"argument_needle", "print('Hello, World!')"}};
// Note: the arguments are rendered in both cases, but may be double-escaped, which we don't want.
out = try_raw_render(json::array({
dummy_user_msg,
make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj.dump())})),
}), {}, false);
auto tool_call_renders_str_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
out = try_raw_render(json::array({
dummy_user_msg,
make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj)})),
}), {}, false);
auto tool_call_renders_obj_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
if (caps_.supports_tool_calls) {
auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump());
auto tc1 = make_tool_call("test_tool1", dummy_args);
auto tc2 = make_tool_call("test_tool2", dummy_args);
auto out = try_raw_render(json::array({
dummy_user_msg,
make_tool_calls_msg(json::array({tc1, tc2})),
}), {}, false);
caps_.supports_parallel_tool_calls = contains(out, "test_tool1") && contains(out, "test_tool2");
out = try_raw_render(json::array({
dummy_user_msg,
make_tool_calls_msg(json::array({tc1})),
{
{"role", "tool"},
{"name", "test_tool1"},
{"content", "Some response!"},
{"tool_call_id", "call_911_"},
}
}), {}, false);
caps_.supports_tool_responses = contains(out, "Some response!");
caps_.supports_tool_call_id = contains(out, "call_911_");
}
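// If the template has no native tool support, infer an example of its tool
// call syntax: render the same conversation with and without a synthetic
// assistant tool call, and keep the suffix where the two renders diverge.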
try {
if (!caps_.supports_tools) {
const json user_msg {
{"role", "user"},
{"content", "Hey"},
};
const json args {
{"arg1", "some_value"},
};
const json tool_call_msg {
{"role", "assistant"},
{"content", nullptr},
{"tool_calls", json::array({
{
// TODO: detect if requires numerical id or fixed length == 6 like Nemo
{"id", "call_1___"},
{"type", "function"},
{"function", {
{"name", "tool_name"},
{"arguments", (caps_.requires_object_arguments ? args : json(minja::Value(args).dump(-1, /* to_json= */ true)))},
}},
},
})},
};
std::string prefix, full;
{
chat_template_inputs inputs;
inputs.messages = json::array({user_msg});
inputs.add_generation_prompt = true;
prefix = apply(inputs);
}
{
chat_template_inputs inputs;
inputs.messages = json::array({user_msg, tool_call_msg});
inputs.add_generation_prompt = false;
full = apply(inputs);
}
auto eos_pos_last = full.rfind(eos_token_);
if (eos_pos_last == prefix.size() - eos_token_.size() ||
(full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
full = full.substr(0, eos_pos_last);
}
size_t common_prefix_length = 0;
for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
if (prefix[i] != full[i]) {
break;
}
if (prefix[i] == '<') {
// DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
// but it removes thinking tags for past messages.
// The prefix and full strings diverge at <think> vs. <tool▁calls▁begin>, so we avoid consuming the leading <.
continue;
}
common_prefix_length = i + 1;
}
auto example = full.substr(common_prefix_length);
if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
} else {
tool_call_example_ = example;
}
}
} catch (const std::exception & e) {
fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
}
}
const std::string & source() const { return source_; }
const std::string & bos_token() const { return bos_token_; }
const std::string & eos_token() const { return eos_token_; }
const chat_template_caps & original_caps() const { return caps_; }
// Deprecated, please use the form with chat_template_inputs and chat_template_options
std::string apply(
const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools,
bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(),
bool apply_polyfills = true)
{
fprintf(stderr, "[%s] Deprecated!\n", __func__);
chat_template_inputs inputs;
inputs.messages = messages;
inputs.tools = tools;
inputs.add_generation_prompt = add_generation_prompt;
inputs.extra_context = extra_context;
inputs.now = std::chrono::system_clock::now();
chat_template_options opts;
opts.apply_polyfills = apply_polyfills;
return apply(inputs, opts);
}
std::string apply(
const chat_template_inputs & inputs,
const chat_template_options & opts = chat_template_options()) const
{
json actual_messages;
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto has_tool_calls = false;
auto has_tool_responses = false;
auto has_string_content = false;
for (const auto & message : inputs.messages) {
if (message.contains("tool_calls") && !message["tool_calls"].is_null()) {
has_tool_calls = true;
}
if (message.contains("role") && message["role"] == "tool") {
has_tool_responses = true;
}
if (message.contains("content") && message["content"].is_string()) {
has_string_content = true;
}
}
auto polyfill_system_role = opts.polyfill_system_role && !caps_.supports_system_role;
auto polyfill_tools = opts.polyfill_tools && has_tools && !caps_.supports_tools;
auto polyfill_tool_call_example = polyfill_tools && opts.polyfill_tool_call_examples;
auto polyfill_tool_calls = opts.polyfill_tool_calls && has_tool_calls && !caps_.supports_tool_calls;
auto polyfill_tool_responses = opts.polyfill_tool_responses && has_tool_responses && !caps_.supports_tool_responses;
auto polyfill_object_arguments = opts.polyfill_object_arguments && has_tool_calls && caps_.requires_object_arguments;
auto polyfill_typed_content = opts.polyfill_typed_content && has_string_content && caps_.requires_typed_content;
auto needs_polyfills = opts.apply_polyfills && (false
|| polyfill_system_role
|| polyfill_tools
|| polyfill_tool_calls
|| polyfill_tool_responses
|| polyfill_object_arguments
|| polyfill_typed_content
);
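// When any polyfill is needed, the messages are rewritten below before
// rendering: system turns are merged into user turns, tool calls/responses
// are serialized as JSON content, and plain string content is wrapped as
// typed content.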
if (needs_polyfills) {
actual_messages = json::array();
auto add_message = [&](const json & msg) {
if (polyfill_typed_content && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) {
actual_messages.push_back({
{"role", msg.at("role")},
{"content", {{
{"type", "text"},
{"text", msg.at("content")},
}}},
});
} else {
actual_messages.push_back(msg);
}
};
std::string pending_system;
auto flush_sys = [&]() {
if (!pending_system.empty()) {
add_message({
{"role", "user"},
{"content", pending_system},
});
pending_system.clear();
}
};
json adjusted_messages;
if (polyfill_tools) {
adjusted_messages = add_system(inputs.messages,
"You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
(!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
} else {
adjusted_messages = inputs.messages;
}
for (const auto & message_ : adjusted_messages) {
auto message = message_;
if (!message.contains("role") || !message.contains("content")) {
throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
}
std::string role = message.at("role");
if (message.contains("tool_calls")) {
if (polyfill_object_arguments || polyfill_tool_calls) {
for (auto & tool_call : message.at("tool_calls")) {
if (tool_call["type"] == "function") {
auto & function = tool_call.at("function");
auto & arguments = function.at("arguments");
if (arguments.is_string()) {
try {
arguments = json::parse(arguments.get<std::string>());
} catch (const std::exception & ecvt) {
fprintf(stderr, "Failed to parse arguments: %s\n", ecvt.what());
}
}
}
}
}
if (polyfill_tool_calls) {
auto content = message.at("content");
auto tool_calls = json::array();
for (const auto & tool_call : message.at("tool_calls")) {
if (tool_call.at("type") != "function") {
continue;
}
const auto & function = tool_call.at("function");
auto tc = json {
{"name", function.at("name")},
{"arguments", function.at("arguments")},
};
if (tool_call.contains("id")) {
tc["id"] = tool_call["id"];
}
tool_calls.push_back(tc);
}
auto obj = json {
{"tool_calls", tool_calls},
};
if (!content.is_null() && content != "") {
obj["content"] = content;
}
message["content"] = obj.dump(2);
message.erase("tool_calls");
}
}
if (polyfill_tool_responses && role == "tool") {
message["role"] = "user";
auto obj = json {
{"tool_response", {
{"content", message.at("content")},
}},
};
if (message.contains("name")) {
obj["tool_response"]["name"] = message.at("name");
}
if (message.contains("tool_call_id")) {
obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
}
message["content"] = obj.dump(2);
message.erase("name");
}
if (!message["content"].is_null() && polyfill_system_role) {
std::string content = message.at("content");
if (role == "system") {
if (!pending_system.empty()) pending_system += "\n";
pending_system += content;
continue;
} else {
if (role == "user") {
if (!pending_system.empty()) {
message["content"] = pending_system + (content.empty() ? "" : "\n" + content);
pending_system.clear();
}
} else {
flush_sys();
}
}
}
add_message(message);
}
flush_sys();
} else {
actual_messages = inputs.messages;
}
auto context = minja::Context::make(json({
{"messages", actual_messages},
{"add_generation_prompt", inputs.add_generation_prompt},
}));
context->set("bos_token", opts.use_bos_token ? bos_token_ : "");
context->set("eos_token", opts.use_eos_token ? eos_token_ : "");
if (opts.define_strftime_now) {
auto now = inputs.now;
context->set("strftime_now", Value::callable([now](const std::shared_ptr<minja::Context> &, minja::ArgumentsValue & args) {
args.expectArgs("strftime_now", {1, 1}, {0, 0});
auto format = args.args[0].get<std::string>();
auto time = std::chrono::system_clock::to_time_t(now);
auto local_time = *std::localtime(&time);
std::ostringstream ss;
ss << std::put_time(&local_time, format.c_str());
return ss.str();
}));
}
if (!inputs.tools.is_null()) {
context->set("tools", minja::Value(inputs.tools));
}
if (!inputs.extra_context.is_null()) {
for (auto & kv : inputs.extra_context.items()) {
context->set(kv.key(), minja::Value(kv.value()));
}
}
auto ret = template_root_->render(context);
// fprintf(stderr, "actual_messages: %s\n", actual_messages.dump(2).c_str());
// fprintf(stderr, "apply: %s\n\n", ret.c_str());
return ret;
}
static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
json messages_with_system = messages;
if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
std::string existing_system = messages_with_system.at(0).at("content");
messages_with_system[0] = json {
{"role", "system"},
{"content", existing_system + "\n\n" + system_prompt},
};
} else {
messages_with_system.insert(messages_with_system.begin(), json {
{"role", "system"},
{"content", system_prompt},
});
}
return messages_with_system;
}
};
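// Minimal usage sketch (illustrative only: the template string and tokens
// below are made up, not taken from any real model):
//
//   minja::chat_template tmpl(
//       "{% for m in messages %}<|{{ m.role }}|>{{ m.content }}<|end|>{% endfor %}",
//       /* bos_token= */ "<s>", /* eos_token= */ "</s>");
//   minja::chat_template_inputs in;
//   in.messages = json::array({{{"role", "user"}, {"content", "Hi"}}});
//   in.add_generation_prompt = true;
//   std::string prompt = tmpl.apply(in);  // uses default chat_template_options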
} // namespace minja

View file

@ -1,966 +0,0 @@
#include "chat.hpp"
#include "chat-template.hpp"
#include "json-schema-to-grammar.h"
#include "log.h"
#include "minja.hpp"
std::string common_chat_format_name(common_chat_format format) {
switch (format) {
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
default:
throw std::runtime_error("Unknown chat format");
}
}
const common_grammar_options grammar_options {
/* .dotall = */ false,
/* .compact_spaces = */ false,
// /* .compact_spaces = */ true,
};
static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) {
// // https://json.nlohmann.me/features/parsing/sax_interface/
struct json_error_locator : public nlohmann::json_sax<json> {
std::size_t position;
bool found_error;
json_error_locator() : position(0), found_error(false) {}
bool parse_error(std::size_t position, const std::string &, const json::exception &) override {
this->position = position - 1;
this->found_error = true;
return false;
}
bool null() override { return true; }
bool boolean(bool) override { return true; }
bool number_integer(number_integer_t) override { return true; }
bool number_unsigned(number_unsigned_t) override { return true; }
bool number_float(number_float_t, const string_t &) override { return true; }
bool string(string_t &) override { return true; }
bool binary(binary_t &) override { return true; }
bool start_object(std::size_t) override { return true; }
bool key(string_t &) override { return true; }
bool end_object() override { return true; }
bool start_array(std::size_t) override { return true; }
bool end_array() override { return true; }
};
json_error_locator err_loc;
json::sax_parse(it, end, &err_loc);
std::string::const_iterator temptative_end;
if (err_loc.found_error) {
temptative_end = it + err_loc.position;
} else {
temptative_end = end;
}
std::string json_sub {it, temptative_end};
try {
out = json::parse(json_sub);
it = temptative_end;
return true;
} catch (const std::exception &) {
return false;
}
}
/**
* Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
* Aggregates the prefix, suffix and in-between text into the content.
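* For example (hypothetical format), with a function_regex that captures NAME in
* ">>>NAME\n" and a close_regex matching "\n", the input
*   let me check>>>get_time\n{"tz": "UTC"}\n
* yields content "let me check" and a single get_time call with those arguments.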
*/
static common_chat_msg parse_json_tool_calls(
const std::string& input,
const std::optional<std::regex> & trigger_opt,
const std::regex & function_regex,
const std::regex & close_regex) {
std::smatch match;
common_chat_msg result;
result.role = "assistant";
auto end = input.end();
auto it = input.begin();
if (trigger_opt) {
if (!std::regex_search(it, end, match, *trigger_opt)) {
result.content = input;
return result;
}
result.content = match.prefix().str();
it = match.suffix().first;
}
while (it != end) {
std::sregex_iterator rend;
std::sregex_iterator rit(it, end, function_regex);
if (rit == rend) {
fprintf(stderr, "No more tool calls found\n");
result.content += std::string(it, end);
break;
}
auto name = rit->str(1);
result.content += std::string(it, rit->prefix().second);
it = rit->suffix().first;
json arguments;
if (!parse_json(it, end, arguments)) {
throw std::runtime_error("Failed to parse json tool call arguments");
}
if (!std::regex_search(it, end, match, close_regex)) {
throw std::runtime_error("Malformed input, missing closing pattern");
}
it = match.suffix().first;
result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
}
return result;
}
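// Splits the input at the first occurrence of `prefix`: everything before it
// becomes the content, and everything after it (re-including the last
// rstrip_prefix characters of the prefix) is parsed as a JSON array of
// {name, arguments[, id]} tool calls.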
static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) {
auto content_end = input.find(prefix);
size_t tc_start = std::string::npos;
common_chat_msg result;
result.role = "assistant";
const auto process_tool_calls = [&](const json & tool_calls) {
for (const auto & tool_call : tool_calls) {
const auto & arguments = tool_call["arguments"];
result.tool_calls.push_back({
tool_call["name"],
arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
tool_call.contains("id") ? tool_call["id"] : "",
});
}
};
if (content_end == std::string::npos) {
result.content = input;
} else {
tc_start = content_end + prefix.size() - rstrip_prefix;
result.content = input.substr(0, content_end);
auto tool_calls = json::parse(input.substr(tc_start));
process_tool_calls(tool_calls);
}
return result;
}
static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
for (const auto & tool : tools) {
if (!tool.contains("type") || tool["type"] != "function" || !tool.contains("function")) {
LOG_INF("Skipping tool without function: %s", tool.dump(2).c_str());
continue;
}
fn(tool);
}
}
static std::string apply(
const common_chat_template & tmpl,
const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools,
bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
{
minja::chat_template_inputs tmpl_inputs;
tmpl_inputs.messages = messages;
tmpl_inputs.tools = tools;
tmpl_inputs.add_generation_prompt = add_generation_prompt;
tmpl_inputs.extra_context = extra_context;
// TODO: add flag to control date/time, if only for testing purposes.
// tmpl_inputs.now = std::chrono::system_clock::now();
minja::chat_template_options tmpl_opts;
tmpl_opts.use_bos_token = false;
tmpl_opts.use_eos_token = false;
return tmpl.apply(tmpl_inputs, tmpl_opts);
}
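// Generic fallback handler: builds a JSON-schema grammar that forces the model
// to emit either {"tool_call": ...} / {"tool_calls": [...]} or {"response": ...},
// and prepends a system message asking for exactly that JSON shape.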
static common_chat_params common_chat_params_init_generic(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
common_chat_params data;
auto tool_call_schemas = json::array();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
auto tool_schema = json {
{"type", "object"},
{"properties", {
{"name", {
{"type", "string"},
{"const", function["name"]},
}},
{"arguments", function["parameters"]},
}},
{"required", json::array({"name", "arguments"})},
};
if (function.contains("description")) {
tool_schema["description"] = function["description"];
}
if (inputs.parallel_tool_calls) {
tool_schema["properties"]["id"] = {
{"type", "string"},
{"minLength", 4},
};
tool_schema["required"].push_back("id");
}
tool_call_schemas.emplace_back(tool_schema);
});
const auto tool_call =
inputs.parallel_tool_calls
? json {
{"type", "object"},
{"properties", {
{"tool_calls", {
{"type", "array"},
{"items", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
{"anyOf", tool_call_schemas},
}},
{"minItems", 1},
}},
}},
{"required", json::array({"tool_calls"})},
}
: json {
{"type", "object"},
{"properties", {
{"tool_call", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
{"anyOf", tool_call_schemas},
}},
}},
{"required", json::array({"tool_call"})},
};
const auto schema =
inputs.tool_choice != "required"
? json {
{"anyOf", json::array({
tool_call,
{
{"type", "object"},
{"properties", {
{"response", inputs.json_schema.is_null()
? json {{"type", "string"}}
: inputs.json_schema
},
}},
{"required", json::array({"response"})},
},
})}
}
: tool_call;
data.grammar_lazy = false;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
builder.add_schema("root", schema);
}, grammar_options);
auto tweaked_messages = common_chat_template::add_system(
inputs.messages,
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_GENERIC;
return data;
}
static common_chat_msg common_chat_parse_generic(const std::string & input) {
json data = json::parse(input);
common_chat_msg result;
result.role = "assistant";
if (data.contains("tool_calls")) {
for (const auto & tool_call : data["tool_calls"]) {
result.tool_calls.push_back({
tool_call["name"],
tool_call["arguments"].dump(),
tool_call.contains("id") ? tool_call["id"] : "",
});
}
} else if (data.contains("tool_call")) {
result.tool_calls.push_back({
data["tool_call"]["name"],
data["tool_call"]["arguments"].dump(),
/* id= */ "",
});
} else if (data.contains("response")) {
const auto & response = data["response"];
result.content = response.is_string() ? response.get<std::string>() : response.dump(2);
}
return result;
}
static common_chat_params common_chat_params_init_mistral_nemo(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
common_chat_params data;
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
auto schemas = json::array();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
schemas.push_back({
{"type", "object"},
{"properties", {
// Important note: the model is probably trained to take a JSON stringified arguments value.
// It's hard to constrain that for now (while reusing the JSON schema conversion), so we're just expecting a plain object.
{"name", {
{"type", "string"},
{"const", function["name"]},
}},
{"arguments", function["parameters"]},
{"id", {
{"type", "string"},
// Nemo's template expects a 9-character alphanumeric ID.
{"pattern", "^[a-zA-Z0-9]{9}$"},
}},
}},
{"required", json::array({"name", "arguments", "id"})},
});
});
auto schema = json {
{"type", "array"},
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
{"minItems", 1},
};
if (!inputs.parallel_tool_calls) {
schema["maxItems"] = 1;
}
builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
}, grammar_options);
data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true});
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
return data;
}
static common_chat_msg common_chat_parse_mistral_nemo(const std::string & input) {
return parse_prefixed_json_tool_call_array(input, "[TOOL_CALLS]");
}
static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
common_chat_params data;
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
auto schemas = json::array();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
schemas.push_back({
{"type", "object"},
{"properties", {
{"tool_call_id", {
{"type", "string"},
// Command-R's template expects an integer string.
{"pattern", "^[0-9]{1,10}$"},
}},
{"tool_name", {
{"type", "string"},
{"const", function["name"]},
}},
{"parameters", function["parameters"]},
}},
{"required", json::array({"tool_call_id", "tool_name", "parameters"})},
});
});
auto schema = json {
{"type", "array"},
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
{"minItems", 1},
};
if (!inputs.parallel_tool_calls) {
schema["maxItems"] = 1;
}
builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
}, grammar_options);
data.grammar_triggers.push_back({"<|START_ACTION|>", /* .at_start = */ false});
data.preserved_tokens = {
"<|START_RESPONSE|>",
"<|END_RESPONSE|>",
"<|START_THINKING|>",
"<|END_THINKING|>",
"<|END_ACTION|>",
};
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
return data;
}
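// Command R7B replies either with <|START_RESPONSE|>...<|END_RESPONSE|> (plain
// content) or with <|START_THINKING|>...<|END_THINKING|><|START_ACTION|>[...]<|END_ACTION|>,
// where the action payload is a JSON array of {tool_call_id, tool_name, parameters}.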
static common_chat_msg common_chat_parse_command_r7b(const std::string & input) {
static std::regex response_regex("<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
std::smatch match;
common_chat_msg result;
result.role = "assistant";
if (std::regex_match(input, match, response_regex)) {
result.content = match[1].str();
} else if (std::regex_match(input, match, thought_action_regex)) {
result.tool_plan = match[1].str();
auto actions_str = match[2].str();
auto actions = json::parse(actions_str);
for (const auto & action : actions) {
result.tool_calls.push_back({
/* .name = */ action["tool_name"],
/* .arguments = */ action["parameters"].dump(),
/* .id = */ action["tool_call_id"],
});
}
} else {
LOG_ERR("Failed to parse command_r output");
result.content = input;
}
return result;
}
static void expect_tool_parameters(const std::string & name, const json & parameters, const std::vector<std::string> & expected_properties) {
if (!parameters.is_object() || !parameters.contains("type") || parameters["type"] != "object" || !parameters.contains("properties") || !parameters.contains("required")) {
throw std::runtime_error("Parameters of tool " + name + " must be an object w/ required properties");
}
const auto & parameters_properties = parameters.at("properties");
const auto & parameters_required = parameters.at("required");
for (const auto & prop : expected_properties) {
if (!parameters_properties.contains(prop)) {
throw std::runtime_error("Parameters of tool " + name + " is missing property: " + prop);
}
if (std::find(parameters_required.begin(), parameters_required.end(), json(prop)) == parameters_required.end()) {
throw std::runtime_error("Parameters of tool " + name + " must have property marked as required: " + prop);
}
}
if (parameters_properties.size() != expected_properties.size()) {
throw std::runtime_error("Parameters of tool " + name + " must only have these properties:" + string_join(expected_properties, ", "));
}
}
static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const common_chat_template & tmpl, const struct common_chat_inputs & inputs, bool allow_python_tag_builtin_tools) {
auto builtin_tools = json::array();
common_chat_params data;
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> tool_rules;
auto handle_builtin_tool = [&](const std::string & name, const json & parameters) {
if (name == "wolfram_alpha") {
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
expect_tool_parameters(name, parameters, {"query"});
} else if (name == "web_search" || name == "brave_search") {
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
expect_tool_parameters(name, parameters, {"query"});
} else if (name == "python" || name == "code_interpreter") {
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
expect_tool_parameters(name, parameters, {"code"});
} else {
return false;
}
std::vector<std::string> kvs;
for (const auto & [key, value] : parameters.at("properties").items()) {
kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value));
}
tool_rules.push_back(
builder.add_rule(
name + "-call",
"\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\""));
builtin_tools.push_back(name);
return true;
};
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
std::string name = function["name"];
auto parameters = function["parameters"];
builder.resolve_refs(parameters);
// https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
if (allow_python_tag_builtin_tools) {
handle_builtin_tool(name, parameters);
}
tool_rules.push_back(
builder.add_rule(
name + "-call",
"\"{\" space "
"( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? "
"\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
builder.add_schema(name + "-args", parameters) +
" \"}\""));
data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true});
});
data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true});
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true});
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
if (!builtin_tools.empty()) {
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
}
builder.add_rule("root", string_join(tool_rules, " | "));
}, grammar_options);
data.additional_stops.push_back("<|eom_id|>");
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
{"tools_in_user_message", false},
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
});
data.format = allow_python_tag_builtin_tools && !builtin_tools.empty()
? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS
: COMMON_CHAT_FORMAT_LLAMA_3_X;
return data;
}
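// Parses either the JSON form {"name": "...", "parameters": {...}} (optionally
// prefixed with "type": "function") or, when builtin tools are enabled, the
// <|python_tag|>NAME.call(arg=value) form, e.g. <|python_tag|>brave_search.call(query="...").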
static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
// TODO: tighten & simplify the parser, don't accept leading text context.
static std::regex function_regex("\\{[\\s\\n\\r]*(?:\"type\"[\\s\\n\\r]*:[\\s\\n\\r]*\"function\"[\\s\\n\\r]*,[\\s\\n\\r]*|[\\s\\n\\r]*)\"name\"[\\s\\n\\r]*:[\\s\\n\\r]*\"([^\"]+)\"[\\s\\n\\r]*,[\\s\\n\\r]*\"parameters\": ");
static std::regex close_regex("\\}");
static std::regex builtin_call_regex("<\\|python_tag\\|>([^.(]+)\\.call\\((.*)\\)");
if (with_builtin_tools) {
std::smatch match;
if (std::regex_match(input, match, builtin_call_regex)) {
auto name = match[1].str();
auto raw_args = match[2].str();
// TODO: if/when builtin tools start accepting more than 1 argument, use parse_json for real parsing.
auto it_eq = raw_args.find('=');
auto arg_name = raw_args.substr(0, it_eq);
auto arg_value_str = raw_args.substr(it_eq + 1);
auto arg_value = json::parse(arg_value_str);
return {
/* .role = */ "assistant",
/* .content = */ match.prefix().str(),
/* .tool_calls = */ {
{
/* .name = */ match[1],
/* .arguments = */ (json {
{arg_name, arg_value},
}).dump(),
/* .id = */ "",
},
},
};
}
}
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
}
static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
common_chat_params data;
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> tool_rules;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
std::string name = function["name"];
auto parameters = function["parameters"];
auto args_rule = builder.add_schema(name + "-args", parameters);
tool_rules.push_back(builder.add_rule(name + "-call",
"\"<tool▁call▁begin>function<tool▁sep>" + name + "\\n```json\\n\" " + args_rule + " \"```<tool▁call▁end>\""));
});
data.grammar_triggers.push_back({"<tool▁calls▁begin>", /* .at_start = */ false});
data.preserved_tokens = {
"<tool▁sep>",
"<tool▁call▁end>",
};
builder.add_rule("root", "\"<tool▁calls▁begin>\" (" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " space");
}, grammar_options);
auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.prompt = prompt;
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
return data;
}
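// DeepSeek R1 wraps calls as
//   <tool▁calls▁begin><tool▁call▁begin>function<tool▁sep>NAME\n```json\n{...}```<tool▁call▁end>
// which the regexes below split back into (name, JSON arguments) pairs.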
static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input) {
static std::regex trigger_regex("<tool▁calls▁begin>");
static std::regex function_regex("<tool▁call▁begin>function<tool▁sep>([^\n]+)\n```json\n");
static std::regex close_regex("```<tool▁call▁end>");
return parse_json_tool_calls(input, trigger_regex, function_regex, close_regex);
}
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
fprintf(stderr, "%s\n", __func__);
common_chat_params data;
data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
{"datetime", "Jan 29 2025 13:00:00 GMT"},
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
});
if (!inputs.tools.is_null() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
auto schemas = json::array();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
schemas.push_back({
{"type", "object"},
{"properties", {
{"name", {
{"type", "string"},
{"const", function["name"]},
}},
{"arguments", function["parameters"]},
}},
{"required", json::array({"name", "arguments", "id"})},
});
});
auto schema = json {
{"type", "array"},
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
{"minItems", 1},
};
if (!inputs.parallel_tool_calls) {
schema["maxItems"] = 1;
}
builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema));
}, grammar_options);
data.grammar_triggers.push_back({" functools[", /* .at_start = */ false});
data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2;
} else {
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
}
return data;
}
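// FireFunction v2 emits " functools[{...}, ...]"; rstrip_prefix=1 keeps the
// trailing "[" so the remainder parses directly as a JSON array of calls.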
static common_chat_msg common_chat_parse_firefunction_v2(const std::string & input) {
return parse_prefixed_json_tool_call_array(input, " functools[", /* rstrip_prefix= */ 1);
}
static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
// >>>all\nlet's call functions>>>fn1\n{"arg1": 1...}\n>>>fn2\n{"arg1": 1...}...
// Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
common_chat_params data;
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
if (!inputs.tools.is_null() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> first_tool_rules;
std::vector<std::string> subsequent_tool_rules;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
std::string name = function["name"];
auto parameters = function["parameters"];
auto args_rule = builder.add_schema(name + "-args", parameters);
first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule));
subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule));
data.grammar_triggers.push_back({name, /* .at_start = */ true});
data.grammar_triggers.push_back({">>>" + name, /* .at_start = */ false});
});
auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space";
if (inputs.parallel_tool_calls) {
auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space";
builder.add_rule("root", first_rule + " (" + subsequent_rule + ")*");
} else {
builder.add_rule("root", first_rule);
}
}, grammar_options);
}
return data;
}
static bool consume(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
auto expected_it = expected.begin();
auto tmp_it = it;
while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
++tmp_it;
++expected_it;
}
if (expected_it == expected.end()) {
it = tmp_it;
return true;
}
return false;
}
static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
static std::regex function_regex(R"((?:>>>)?(\w+)\n)");
static std::regex close_regex(R"($|(?=>>>))");
std::string content;
auto it = input.begin();
const auto end = input.end();
if (consume(it, end, "all\n")) {
std::smatch match;
if (std::regex_search(it, end, match, function_regex)) {
auto fun_it = match.prefix().second;
content = std::string(it, fun_it);
it = fun_it;
} else {
common_chat_msg res;
res.role = "assistant";
res.content = std::string(it, end);
return res;
}
}
// TODO: tighten & simplify.
try {
auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
res.content = content + res.content;
return res;
} catch (const std::exception & e) {
LOG_ERR("Failed to parse functionary v3.2 input: %s\n", e.what());
common_chat_msg res;
res.role = "assistant";
res.content = input;
return res;
}
}
static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
// https://github.com/MeetKai/functionary/blob/main/tests/prompt_test_v3-llama3.1.txt
common_chat_params data;
json tools = inputs.tools.is_null() ? inputs.tools : json::array();
std::string python_code_argument_name;
auto has_raw_python = false;
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> tool_rules;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
const auto & parameters = function["parameters"];
std::string name = function["name"];
if (name == "python" || name == "ipython") {
if (!parameters.contains("type")) {
throw std::runtime_error("Missing type in python tool");
}
has_raw_python = true;
auto type = parameters.at("type");
if (type == "object") {
auto properties = parameters.at("properties");
for (auto it = properties.begin(); it != properties.end(); ++it) {
if (it.value().at("type") == "string") {
if (!python_code_argument_name.empty()) {
throw std::runtime_error("Multiple string arguments found in python tool");
}
python_code_argument_name = it.key();
}
}
if (python_code_argument_name.empty()) {
throw std::runtime_error("No string argument found in python tool");
}
} else if (type != "string") {
throw std::runtime_error("Invalid type in python tool: " + type.dump());
}
}
tool_rules.push_back(builder.add_rule(name + "-call", "\"<function=" + name + ">\" " + builder.add_schema(name + "-args", parameters) + " \"</function>\" space"));
});
if (has_raw_python) {
tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
}
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
data.grammar_triggers.push_back({"<function=", /* .at_start = */ false});
}, grammar_options);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
// TODO: if (has_raw_python)
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
return data;
}
static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
// This version of Functionary still supports the llama 3.1 tool call format for the python tool.
static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
std::smatch match;
if (std::regex_search(input, match, python_tag_regex)) {
auto code = match[1].str();
return {
/* .role = */ "assistant",
/* .content = */ match.prefix().str(),
/* .tool_calls = */ {
{
/* .name = */ "python",
/* .arguments = */ (json {{"code", code}}).dump(),
/* .id = */ "",
},
}
};
}
static std::regex function_regex(R"(<function=(\w+)>)");
static std::regex close_regex(R"(</function>)");
// TODO: tighten & simplify.
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
}
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
common_chat_params data;
// (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
data.grammar_lazy = inputs.tool_choice != "required";
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> tool_rules;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool["function"];
std::string name = function["name"];
auto parameters = function["parameters"];
builder.resolve_refs(parameters);
tool_rules.push_back(builder.add_schema(name + "-call", {
{"type", "object"},
{"properties", json {
{"name", json {{"const", name}}},
{"arguments", parameters},
}},
{"required", json::array({"name", "arguments"})},
}));
});
auto tool_call = "\"<tool_call>\" space " + builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " \"</tool_call>\" space";
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
data.grammar_triggers.push_back({"<tool_call>", /* .at_start = */ false});
data.preserved_tokens = { "</tool_call>" };
}, grammar_options);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
return data;
}
static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) {
try {
std::regex start_pattern(R"([\n\s]*<tool_call>)");
std::regex middle_pattern(R"([\n\s]*</tool_call>[\n\s]*<tool_call>)");
std::regex end_pattern(R"([\n\s]*</tool_call>[\n\s]*$)");
auto end = input.end();
std::sregex_iterator rend;
std::sregex_iterator rit(input.begin(), end, start_pattern);
if (rit == rend) {
return {
/* .role = */ "assistant",
/* .content = */ input,
/* .tool_calls = */ {},
};
}
common_chat_msg result;
result.role = "assistant";
result.content = rit->prefix();
auto it = rit->suffix().first;
while (it != end) {
json call;
if (!parse_json(it, end, call)) {
throw std::runtime_error("Failed to parse json tool call");
}
const auto & arguments = call["arguments"];
result.tool_calls.push_back({
call["name"],
arguments.dump(),
// arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
/* id= */ "",
});
rit = {it, end, middle_pattern};
if (rit != rend) {
it = rit->suffix().first;
} else {
rit = {it, end, end_pattern};
if (rit == rend) {
throw std::runtime_error("Malformed input, missing </tool_call>");
}
break;
}
}
return result;
} catch (const std::exception & e) {
return {
/* .role = */ "assistant",
/* .content = */ input,
/* .tool_calls = */ {},
};
}
}
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
data.grammar_lazy = false;
if (!inputs.json_schema.is_null()) {
if (!inputs.grammar.empty()) {
throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
}
data.grammar = json_schema_to_grammar(inputs.json_schema);
} else {
data.grammar = inputs.grammar;
}
return data;
}
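// Dispatch on distinctive marker strings in the template source (e.g. ">>>all"
// -> Functionary v3.2, "[TOOL_CALLS]" -> Mistral Nemo, "<tool_call>" -> Hermes
// 2 Pro); the generic JSON handler is the fallback.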
common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
auto has_tools = !inputs.tools.is_null() && inputs.tool_choice != "none";
LOG_DBG("[%s] has_tools=%s\n", __func__, has_tools ? "true" : "false");
if (has_tools && !inputs.grammar.empty()) {
throw std::runtime_error("Cannot specify grammar with tools");
}
const auto & src = tmpl.source();
if (src.find(">>>all") != std::string::npos) {
// Functionary prepends "all\n" to plain content outputs, so we use its parser even when no tools are requested.
return common_chat_params_init_functionary_v3_2(tmpl, inputs);
}
if (src.find(" functools[") != std::string::npos) {
// Firefunction v2 requires datetime and functions in the context, even w/o tools.
return common_chat_params_init_firefunction_v2(tmpl, inputs);
}
if (!has_tools) {
return common_chat_params_init_without_tools(tmpl, inputs);
}
if (src.find("<tool_call>") != std::string::npos) {
return common_chat_params_init_hermes_2_pro(tmpl, inputs);
}
if (src.find("<|start_header_id|>") != std::string::npos
&& src.find("<function=") != std::string::npos) {
return common_chat_params_init_functionary_v3_1_llama_3_1(tmpl, inputs);
}
if (src.find("<|start_header_id|>ipython<|end_header_id|>") != std::string::npos) {
auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
return common_chat_params_init_llama_3_1_tool_calls(tmpl, inputs, allow_python_tag_builtin_tools);
}
if (src.find("<tool▁calls▁begin>") != std::string::npos) {
return common_chat_params_init_deepseek_r1(tmpl, inputs);
}
if (src.find("[TOOL_CALLS]") != std::string::npos) {
return common_chat_params_init_mistral_nemo(tmpl, inputs);
}
if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos) {
return common_chat_params_init_command_r7b(tmpl, inputs);
}
return common_chat_params_init_generic(tmpl, inputs);
}
static common_chat_msg common_chat_parse_content_only(const std::string & input) {
return {
/* .role = */ "assistant",
/* .content = */ input,
/* .tool_calls = */ {},
};
}
common_chat_msg common_chat_parse(const std::string & input, common_chat_format format) {
switch (format) {
case COMMON_CHAT_FORMAT_CONTENT_ONLY:
return common_chat_parse_content_only(input);
case COMMON_CHAT_FORMAT_GENERIC:
return common_chat_parse_generic(input);
case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
return common_chat_parse_mistral_nemo(input);
case COMMON_CHAT_FORMAT_LLAMA_3_X:
return common_chat_parse_llama_3_1(input);
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS:
return common_chat_parse_llama_3_1(input, /* with_builtin_tools= */ true);
case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
return common_chat_parse_deepseek_r1(input);
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
return common_chat_parse_functionary_v3_2(input);
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
return common_chat_parse_functionary_v3_1_llama_3_1(input);
case COMMON_CHAT_FORMAT_HERMES_2_PRO:
return common_chat_parse_hermes_2_pro(input);
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
return common_chat_parse_firefunction_v2(input);
case COMMON_CHAT_FORMAT_COMMAND_R7B:
return common_chat_parse_command_r7b(input);
default:
throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
}
}

View file

@ -1,52 +0,0 @@
// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
#pragma once
#include "common.h"
#include <json.hpp>
#include <optional>
#include <string>
#include <vector>
using json = nlohmann::ordered_json;
struct common_chat_inputs {
json messages;
json tools;
json tool_choice;
json json_schema;
bool parallel_tool_calls;
bool stream;
std::string grammar;
bool add_generation_prompt = true;
};
enum common_chat_format {
COMMON_CHAT_FORMAT_CONTENT_ONLY,
COMMON_CHAT_FORMAT_GENERIC,
COMMON_CHAT_FORMAT_MISTRAL_NEMO,
COMMON_CHAT_FORMAT_LLAMA_3_X,
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
COMMON_CHAT_FORMAT_HERMES_2_PRO,
COMMON_CHAT_FORMAT_COMMAND_R7B,
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
struct common_chat_params {
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
json prompt;
std::string grammar;
bool grammar_lazy = false;
std::vector<common_grammar_trigger> grammar_triggers;
std::vector<std::string> preserved_tokens;
std::vector<std::string> additional_stops;
};
struct common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & params);
std::string common_chat_format_name(common_chat_format format);
common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
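// Typical flow (sketch; `tmpl` is a common_chat_template built elsewhere,
// `request_body` stands for a hypothetical OpenAI-style request object, and
// `generated_text` is the sampled model output):
//
//   common_chat_inputs in;
//   in.messages = request_body.at("messages");
//   in.tools = request_body.value("tools", json());
//   auto params = common_chat_params_init(tmpl, in);
//   // ...generate text constrained by params.grammar / params.grammar_triggers...
//   auto msg = common_chat_parse(generated_text, params.format);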

View file

@ -2,9 +2,6 @@
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif #endif
#include "ggml.h"
#include "gguf.h"
#include "common.h" #include "common.h"
#include "log.h" #include "log.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT: // Change JSON_ASSERT from assert() to GGML_ASSERT:
@ -12,8 +9,6 @@
#include "json.hpp" #include "json.hpp"
#include "json-schema-to-grammar.h" #include "json-schema-to-grammar.h"
#include "llama.h" #include "llama.h"
#include "chat.hpp"
#include "chat-template.hpp"
#include <algorithm> #include <algorithm>
#include <cinttypes> #include <cinttypes>
@ -23,7 +18,6 @@
#include <cstdarg> #include <cstdarg>
#include <cstring> #include <cstring>
#include <ctime> #include <ctime>
#include <filesystem>
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include <iterator> #include <iterator>
@ -68,29 +62,11 @@
#ifdef __linux__ #ifdef __linux__
#include <linux/limits.h> #include <linux/limits.h>
#elif defined(_WIN32) #elif defined(_WIN32)
# if !defined(PATH_MAX) #define PATH_MAX MAX_PATH
# define PATH_MAX MAX_PATH
# endif
#else #else
#include <sys/syslimits.h> #include <sys/syslimits.h>
#endif #endif
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
//
// CURL utils
//
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
struct curl_slist_ptr {
struct curl_slist * ptr = nullptr;
~curl_slist_ptr() {
if (ptr) {
curl_slist_free_all(ptr);
}
}
};
#endif // LLAMA_USE_CURL #endif // LLAMA_USE_CURL
using json = nlohmann::ordered_json; using json = nlohmann::ordered_json;
@ -485,48 +461,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
s = std::move(builder); s = std::move(builder);
} }
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
std::ostringstream result;
for (size_t i = 0; i < values.size(); ++i) {
if (i > 0) {
result << separator;
}
result << values[i];
}
return result.str();
}
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
std::vector<std::string> parts;
size_t start = 0;
size_t end = str.find(delimiter);
while (end != std::string::npos) {
parts.push_back(str.substr(start, end - start));
start = end + delimiter.length();
end = str.find(delimiter, start);
}
parts.push_back(str.substr(start));
return parts;
}
std::string string_repeat(const std::string & str, size_t n) {
if (n == 0) {
return "";
}
std::string result;
result.reserve(str.length() * n);
for (size_t i = 0; i < n; ++i) {
result += str;
}
return result;
}
std::string string_from(bool value) { std::string string_from(bool value) {
return value ? "true" : "false"; return value ? "true" : "false";
} }
@ -909,7 +843,7 @@ struct common_init_result common_init_from_params(common_params & params) {
} else if (!params.model_url.empty()) { } else if (!params.model_url.empty()) {
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams); model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
} else { } else {
model = llama_model_load_from_file(params.model.c_str(), mparams); model = llama_load_model_from_file(params.model.c_str(), mparams);
} }
if (model == NULL) { if (model == NULL) {
@ -917,28 +851,26 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams; return iparams;
} }
const llama_vocab * vocab = llama_model_get_vocab(model);
if (params.reranking) { if (params.reranking) {
bool ok = true; bool ok = true;
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) { if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__); LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
ok = false; ok = false;
} }
if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) { if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__); LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
ok = false; ok = false;
} }
if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) { if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__); LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
ok = false; ok = false;
} }
if (!ok) { if (!ok) {
llama_model_free(model); llama_free_model(model);
return iparams; return iparams;
} }
@ -946,40 +878,40 @@ struct common_init_result common_init_from_params(common_params & params) {
auto cparams = common_context_params_to_llama(params); auto cparams = common_context_params_to_llama(params);
llama_context * lctx = llama_init_from_model(model, cparams); llama_context * lctx = llama_new_context_with_model(model, cparams);
if (lctx == NULL) { if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str()); LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_model_free(model); llama_free_model(model);
return iparams; return iparams;
} }
if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) { if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__); LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
params.ctx_shift = false; llama_free_model(model);
return iparams;
} }
if (!params.control_vectors.empty()) { if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model); if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
const auto cvec = common_control_vector_load(params.control_vectors); const auto cvec = common_control_vector_load(params.control_vectors);
if (cvec.n_embd == -1) { if (cvec.n_embd == -1) {
llama_free(lctx); llama_free(lctx);
llama_model_free(model); llama_free_model(model);
return iparams; return iparams;
} }
int err = llama_apply_adapter_cvec( int err = llama_control_vector_apply(lctx,
lctx, cvec.data.data(),
cvec.data.data(), cvec.data.size(),
cvec.data.size(), cvec.n_embd,
cvec.n_embd, params.control_vector_layer_start,
params.control_vector_layer_start, params.control_vector_layer_end);
params.control_vector_layer_end);
if (err) { if (err) {
llama_free(lctx); llama_free(lctx);
llama_model_free(model); llama_free_model(model);
return iparams; return iparams;
} }
@ -987,54 +919,33 @@ struct common_init_result common_init_from_params(common_params & params) {
// load and optionally apply lora adapters // load and optionally apply lora adapters
for (auto & la : params.lora_adapters) { for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora; common_lora_adapter_container loaded_la;
lora.reset(llama_adapter_lora_init(model, la.path.c_str())); loaded_la.path = la.path;
if (lora == nullptr) { loaded_la.scale = la.scale;
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
if (loaded_la.adapter == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx); llama_free(lctx);
llama_model_free(model); llama_free_model(model);
return iparams; return iparams;
} }
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
la.ptr = lora.get();
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
} }
if (!params.lora_init_without_apply) { if (!params.lora_init_without_apply) {
common_set_adapter_lora(lctx, params.lora_adapters); common_lora_adapters_apply(lctx, iparams.lora_adapters);
} }
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) { if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__); LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sampling.ignore_eos = false; params.sampling.ignore_eos = false;
} }
if (params.sampling.ignore_eos) {
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
if (llama_vocab_is_eog(vocab, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias.push_back({i, -INFINITY});
}
}
}
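The side of this diff that keeps `--ignore-eos` implements it by pushing a `{token, -INFINITY}` logit bias for every end-of-generation token instead of special-casing EOS later. A minimal, self-contained sketch of applying such a bias list to a raw logits array is below; the `logit_bias` struct is a stand-in for `llama_logit_bias`, not the real type.

```cpp
// Minimal sketch: applying a list of {token, bias} pairs to raw logits.
// The struct below is a stand-in for llama_logit_bias; a bias of -INFINITY
// effectively bans a token, which is how --ignore-eos is handled above.
#include <cmath>
#include <cstdio>
#include <vector>

struct logit_bias {
    int   token;
    float bias;
};

static void apply_logit_bias(std::vector<float> & logits, const std::vector<logit_bias> & biases) {
    for (const auto & lb : biases) {
        if (lb.token >= 0 && (size_t) lb.token < logits.size()) {
            logits[lb.token] += lb.bias; // adding -INFINITY bans the token outright
        }
    }
}

int main() {
    std::vector<float> logits = { 1.0f, 2.5f, 0.3f, 4.2f };
    std::vector<logit_bias> biases = { { 3, -INFINITY } }; // pretend token 3 is an EOG token

    apply_logit_bias(logits, biases);

    for (size_t i = 0; i < logits.size(); i++) {
        std::printf("token %zu -> %f\n", i, logits[i]);
    }
    return 0;
}
```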
if (params.sampling.penalty_last_n == -1) {
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.penalty_last_n = llama_n_ctx(lctx);
}
if (params.sampling.dry_penalty_last_n == -1) {
LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
}
if (params.warmup) { if (params.warmup) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
std::vector<llama_token> tmp; std::vector<llama_token> tmp;
llama_token bos = llama_vocab_bos(vocab); llama_token bos = llama_token_bos(model);
llama_token eos = llama_vocab_eos(vocab); llama_token eos = llama_token_eos(model);
// some models (e.g. T5) don't have a BOS token // some models (e.g. T5) don't have a BOS token
if (bos != LLAMA_TOKEN_NULL) { if (bos != LLAMA_TOKEN_NULL) {
tmp.push_back(bos); tmp.push_back(bos);
@ -1049,7 +960,7 @@ struct common_init_result common_init_from_params(common_params & params) {
if (llama_model_has_encoder(model)) { if (llama_model_has_encoder(model)) {
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size())); llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
llama_token decoder_start_token_id = llama_model_decoder_start_token(model); llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == LLAMA_TOKEN_NULL) { if (decoder_start_token_id == -1) {
decoder_start_token_id = bos; decoder_start_token_id = bos;
} }
tmp.clear(); tmp.clear();
@ -1063,17 +974,17 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_perf_context_reset(lctx); llama_perf_context_reset(lctx);
} }
iparams.model.reset(model); iparams.model = model;
iparams.context.reset(lctx); iparams.context = lctx;
return iparams; return iparams;
} }
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) { void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
llama_clear_adapter_lora(ctx); llama_lora_adapter_clear(ctx);
for (auto & la : lora) { for (auto & la : lora_adapters) {
if (la.scale != 0.0f) { if (la.scale != 0.0f) {
llama_set_adapter_lora(ctx, la.ptr, la.scale); llama_lora_adapter_set(ctx, la.adapter, la.scale);
} }
} }
} }
@ -1087,6 +998,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
if (params.n_gpu_layers != -1) { if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers; mparams.n_gpu_layers = params.n_gpu_layers;
} }
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu; mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode; mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split; mparams.tensor_split = params.tensor_split;
@ -1103,6 +1015,38 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
return mparams; return mparams;
} }
static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "f32") {
return GGML_TYPE_F32;
}
if (s == "f16") {
return GGML_TYPE_F16;
}
if (s == "bf16") {
return GGML_TYPE_BF16;
}
if (s == "q8_0") {
return GGML_TYPE_Q8_0;
}
if (s == "q4_0") {
return GGML_TYPE_Q4_0;
}
if (s == "q4_1") {
return GGML_TYPE_Q4_1;
}
if (s == "iq4_nl") {
return GGML_TYPE_IQ4_NL;
}
if (s == "q5_0") {
return GGML_TYPE_Q5_0;
}
if (s == "q5_1") {
return GGML_TYPE_Q5_1;
}
throw std::runtime_error("Unsupported cache type: " + s);
}
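`kv_cache_type_from_str`, present on the side of this diff where `cache_type_k`/`cache_type_v` are strings, is a plain string-to-enum lookup that throws on unknown names. A table-driven variant of the same idea, sketched with a local enum so it compiles without the ggml headers, could look like this:

```cpp
// Sketch of a table-driven string -> cache-type lookup, mirroring the
// kv_cache_type_from_str helper shown above. A local enum stands in for
// ggml_type so the example is self-contained; the real code maps to
// GGML_TYPE_F16, GGML_TYPE_Q8_0, etc.
#include <cstdio>
#include <stdexcept>
#include <string>
#include <unordered_map>

enum class cache_type { F32, F16, BF16, Q8_0, Q4_0, Q4_1, IQ4_NL, Q5_0, Q5_1 };

static cache_type cache_type_from_str(const std::string & s) {
    static const std::unordered_map<std::string, cache_type> table = {
        { "f32", cache_type::F32 },   { "f16", cache_type::F16 },   { "bf16", cache_type::BF16 },
        { "q8_0", cache_type::Q8_0 }, { "q4_0", cache_type::Q4_0 }, { "q4_1", cache_type::Q4_1 },
        { "iq4_nl", cache_type::IQ4_NL }, { "q5_0", cache_type::Q5_0 }, { "q5_1", cache_type::Q5_1 },
    };
    auto it = table.find(s);
    if (it == table.end()) {
        throw std::runtime_error("Unsupported cache type: " + s);
    }
    return it->second;
}

int main() {
    std::printf("q8_0 -> %d\n", (int) cache_type_from_str("q8_0"));
    return 0;
}
```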
struct llama_context_params common_context_params_to_llama(const common_params & params) { struct llama_context_params common_context_params_to_llama(const common_params & params) {
auto cparams = llama_context_default_params(); auto cparams = llama_context_default_params();
@ -1137,8 +1081,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK; cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
} }
cparams.type_k = params.cache_type_k; cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
cparams.type_v = params.cache_type_v; cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
return cparams; return cparams;
} }
@ -1164,7 +1108,13 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#define CURL_MAX_RETRY 3 #define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2 #define CURL_RETRY_DELAY_SECONDS 2
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
static bool starts_with(const std::string & str, const std::string & prefix) {
// While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
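The local `starts_with` helper relies on the `rfind(prefix, 0) == 0` idiom: anchoring `rfind` at position 0 means it either matches the prefix immediately or fails, so it never scans the whole string. A tiny standalone check of the idiom:

```cpp
// The rfind(prefix, 0) == 0 idiom behaves like C++20's starts_with.
#include <cassert>
#include <string>

static bool starts_with(const std::string & str, const std::string & prefix) {
    return str.rfind(prefix, 0) == 0;
}

int main() {
    assert(starts_with("https://huggingface.co/model", "https://"));
    assert(!starts_with("ftp://example.com", "https://"));
    assert(starts_with("anything", "")); // empty prefix always matches
    return 0;
}
```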
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts; int remaining_attempts = max_attempts;
while (remaining_attempts > 0) { while (remaining_attempts > 0) {
@ -1188,9 +1138,9 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
} }
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl // Initialize libcurl
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
if (!curl) { if (!curl) {
LOG_ERR("%s: error initializing libcurl\n", __func__); LOG_ERR("%s: error initializing libcurl\n", __func__);
return false; return false;
@ -1204,9 +1154,11 @@ static bool common_download_file(const std::string & url, const std::string & pa
// Check if hf-token or bearer-token was specified // Check if hf-token or bearer-token was specified
if (!hf_token.empty()) { if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer " + hf_token; std::string auth_header = "Authorization: Bearer ";
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); auth_header += hf_token.c_str();
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); struct curl_slist *http_headers = NULL;
http_headers = curl_slist_append(http_headers, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
} }
#if defined(_WIN32) #if defined(_WIN32)
@ -1216,7 +1168,8 @@ static bool common_download_file(const std::string & url, const std::string & pa
#endif #endif
// Check if the file already exists locally // Check if the file already exists locally
auto file_exists = std::filesystem::exists(path); struct stat model_file_info;
auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
// If the file exists, check its JSON metadata companion file. // If the file exists, check its JSON metadata companion file.
std::string metadata_path = path + ".json"; std::string metadata_path = path + ".json";
@ -1258,13 +1211,11 @@ static bool common_download_file(const std::string & url, const std::string & pa
std::string etag; std::string etag;
std::string last_modified; std::string last_modified;
}; };
common_load_model_from_url_headers headers; common_load_model_from_url_headers headers;
{ {
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
static std::regex header_regex("([^:]+): (.*)\r\n"); static std::regex header_regex("([^:]+): (.*)\r\n");
static std::regex etag_regex("ETag", std::regex_constants::icase); static std::regex etag_regex("ETag", std::regex_constants::icase);
@ -1476,7 +1427,7 @@ struct llama_model * common_load_model_from_url(
} }
} }
return llama_model_load_from_file(local_path.c_str(), params); return llama_load_model_from_file(local_path.c_str(), params);
} }
struct llama_model * common_load_model_from_hf( struct llama_model * common_load_model_from_hf(
@ -1502,80 +1453,6 @@ struct llama_model * common_load_model_from_hf(
return common_load_model_from_url(model_url, local_path, hf_token, params); return common_load_model_from_url(model_url, local_path, hf_token, params);
} }
/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
*
* Return pair of <repo, file> (with "repo" already having tag removed)
*
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
*/
std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
}
// fetch model info from Hugging Face Hub API
json model_info;
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
std::string res_str;
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
return size * nmemb;
};
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
#if defined(_WIN32)
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer " + hf_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
throw std::runtime_error("error: cannot make GET request to HF API");
}
long res_code;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
if (res_code == 200) {
model_info = json::parse(res_str);
} else if (res_code == 401) {
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
} else {
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
}
// check response
if (!model_info.contains("ggufFile")) {
throw std::runtime_error("error: model does not have ggufFile");
}
json & gguf_file = model_info.at("ggufFile");
if (!gguf_file.contains("rfilename")) {
throw std::runtime_error("error: ggufFile does not have rfilename");
}
return std::make_pair(hf_repo, gguf_file.at("rfilename"));
}
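`common_get_hf_file` (present on one side of this diff) begins by splitting the `<user>/<model>[:quant]` string into a repo and an optional tag before talking to the Hub API. The sketch below reproduces only that parsing and validation step with plain `std::string` operations instead of the project's `string_split` helper; the HTTP call to the manifests endpoint is out of scope here.

```cpp
// Sketch of the repo/tag parsing done at the top of common_get_hf_file:
// split off an optional quant tag (default "latest") and require exactly one
// '/' in the repo part. Pure std::string, no curl or JSON involved.
#include <cstdio>
#include <stdexcept>
#include <string>
#include <utility>

static std::pair<std::string, std::string> parse_hf_repo_with_tag(const std::string & repo_with_tag) {
    std::string repo = repo_with_tag;
    std::string tag  = "latest";

    const size_t colon = repo_with_tag.find(':');
    if (colon != std::string::npos) {
        repo = repo_with_tag.substr(0, colon);
        tag  = repo_with_tag.substr(colon + 1);
    }

    // expect exactly one '/' separating <user>/<model>
    const size_t slash = repo.find('/');
    if (slash == std::string::npos || repo.find('/', slash + 1) != std::string::npos) {
        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]");
    }
    return { repo, tag };
}

int main() {
    auto [repo, tag] = parse_hf_repo_with_tag("bartowski/Llama-3.2-3B-Instruct-GGUF:q4_k_m");
    std::printf("repo = %s, tag = %s\n", repo.c_str(), tag.c_str());

    auto [repo2, tag2] = parse_hf_repo_with_tag("bartowski/Llama-3.2-3B-Instruct-GGUF");
    std::printf("repo = %s, tag = %s\n", repo2.c_str(), tag2.c_str()); // tag defaults to "latest"
    return 0;
}
```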
#else #else
struct llama_model * common_load_model_from_url( struct llama_model * common_load_model_from_url(
@ -1597,11 +1474,6 @@ struct llama_model * common_load_model_from_hf(
return nullptr; return nullptr;
} }
std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return std::make_pair("", "");
}
#endif // LLAMA_USE_CURL #endif // LLAMA_USE_CURL
// //
@ -1700,23 +1572,21 @@ std::vector<llama_token> common_tokenize(
const std::string & text, const std::string & text,
bool add_special, bool add_special,
bool parse_special) { bool parse_special) {
const llama_model * model = llama_get_model(ctx); return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
const llama_vocab * vocab = llama_model_get_vocab(model);
return common_tokenize(vocab, text, add_special, parse_special);
} }
std::vector<llama_token> common_tokenize( std::vector<llama_token> common_tokenize(
const struct llama_vocab * vocab, const struct llama_model * model,
const std::string & text, const std::string & text,
bool add_special, bool add_special,
bool parse_special) { bool parse_special) {
// upper limit for the number of tokens // upper limit for the number of tokens
int n_tokens = text.length() + 2 * add_special; int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens); std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens < 0) { if (n_tokens < 0) {
result.resize(-n_tokens); result.resize(-n_tokens);
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
GGML_ASSERT(check == -n_tokens); GGML_ASSERT(check == -n_tokens);
} else { } else {
result.resize(n_tokens); result.resize(n_tokens);
@ -1725,18 +1595,12 @@ std::vector<llama_token> common_tokenize(
} }
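`common_tokenize`, `common_token_to_piece` and `common_detokenize` all follow the same two-pass convention: call the C API with a guessed buffer and, if the return value is negative, treat its magnitude as the required size, resize, and call again. The self-contained sketch below shows that convention with a fake encoder standing in for `llama_tokenize`.

```cpp
// Two-pass "negative return means required size" pattern used by the
// common_tokenize / common_detokenize wrappers. fake_encode stands in for
// llama_tokenize: it returns -needed when the output buffer is too small.
#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

static int fake_encode(const std::string & text, int * out, int out_size) {
    const int needed = (int) text.size(); // pretend: one token per character
    if (needed > out_size) {
        return -needed;
    }
    for (int i = 0; i < needed; i++) {
        out[i] = (int) text[i];
    }
    return needed;
}

static std::vector<int> tokenize(const std::string & text) {
    std::vector<int> result(4); // deliberately small first guess
    int n = fake_encode(text, result.data(), (int) result.size());
    if (n < 0) {
        result.resize(-n);
        int check = fake_encode(text, result.data(), (int) result.size());
        assert(check == -n);
    } else {
        result.resize(n);
    }
    return result;
}

int main() {
    auto toks = tokenize("hello world");
    std::printf("got %zu tokens\n", toks.size());
    return 0;
}
```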
std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
return common_token_to_piece(vocab, token, special);
}
std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
std::string piece; std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
if (n_chars < 0) { if (n_chars < 0) {
piece.resize(-n_chars); piece.resize(-n_chars);
int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
GGML_ASSERT(check == -n_chars); GGML_ASSERT(check == -n_chars);
} }
else { else {
@ -1746,19 +1610,13 @@ std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token
return piece; return piece;
} }
std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) { std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
return common_detokenize(vocab, tokens, special);
}
std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
std::string text; std::string text;
text.resize(std::max(text.capacity(), tokens.size())); text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
if (n_chars < 0) { if (n_chars < 0) {
text.resize(-n_chars); text.resize(-n_chars);
n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
} }
@ -1772,80 +1630,63 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
// Chat template utils // Chat template utils
// //
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) { bool common_chat_verify_template(const std::string & tmpl) {
if (use_jinja) {
try {
auto chat_template = common_chat_template(tmpl, "<s>", "</s>");
common_chat_inputs inputs;
inputs.messages = json::array({{
{"role", "user"},
{"content", "test"},
}});
common_chat_params_init(chat_template, inputs);
return true;
} catch (const std::exception & e) {
LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
return false;
}
}
llama_chat_message chat[] = {{"user", "test"}}; llama_chat_message chat[] = {{"user", "test"}};
const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0); int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0; return res >= 0;
} }
std::string common_chat_apply_template( std::string common_chat_apply_template(const struct llama_model * model,
const common_chat_template & tmpl, const std::string & tmpl,
const std::vector<common_chat_msg> & msgs, const std::vector<common_chat_msg> & msgs,
bool add_ass, bool add_ass) {
bool use_jinja) {
if (use_jinja) {
auto messages = json::array();
for (const auto & msg : msgs) {
messages.push_back({{"role", msg.role}, {"content", msg.content}});
}
common_chat_inputs inputs;
inputs.messages = messages;
inputs.add_generation_prompt = add_ass;
return common_chat_params_init(tmpl, inputs).prompt;
}
int alloc_size = 0; int alloc_size = 0;
bool fallback = false; // indicate if we must fallback to default chatml
std::vector<llama_chat_message> chat; std::vector<llama_chat_message> chat;
for (const auto & msg : msgs) { for (auto & msg : msgs) {
chat.push_back({msg.role.c_str(), msg.content.c_str()}); chat.push_back({msg.role.c_str(), msg.content.c_str()});
alloc_size += (msg.role.size() + msg.content.size()) * 1.25; alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
} }
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
std::vector<char> buf(alloc_size); std::vector<char> buf(alloc_size);
// run the first time to get the total output length // run the first time to get the total output length
int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size()); int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
// error: chat template is not supported // error: chat template is not supported
if (res < 0) { if (res < 0) {
// if the custom "tmpl" is not supported, we throw an error if (ptr_tmpl != nullptr) {
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() // if the custom "tmpl" is not supported, we throw an error
throw std::runtime_error("this custom template is not supported"); // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
throw std::runtime_error("this custom template is not supported");
} else {
// If the built-in template is not supported, we default to chatml
res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
fallback = true;
}
} }
// if it turns out that our buffer is too small, we resize it // if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) { if ((size_t) res > buf.size()) {
buf.resize(res); buf.resize(res);
res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size()); res = llama_chat_apply_template(
fallback ? nullptr : model,
fallback ? "chatml" : ptr_tmpl,
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
} }
std::string formatted_chat(buf.data(), res); std::string formatted_chat(buf.data(), res);
return formatted_chat; return formatted_chat;
} }
std::string common_chat_format_single( std::string common_chat_format_single(const struct llama_model * model,
const common_chat_template & tmpl, const std::string & tmpl,
const std::vector<common_chat_msg> & past_msg, const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg, const common_chat_msg & new_msg,
bool add_ass, bool add_ass) {
bool use_jinja) {
std::ostringstream ss; std::ostringstream ss;
auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja); auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
std::vector<common_chat_msg> chat_new(past_msg); std::vector<common_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version // if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@ -1853,87 +1694,21 @@ std::string common_chat_format_single(
}; };
// format chat with new_msg // format chat with new_msg
chat_new.push_back(new_msg); chat_new.push_back(new_msg);
auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja); auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
// get the diff part // get the diff part
ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return ss.str(); return ss.str();
} }
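`common_chat_format_single` derives the text for one new message by formatting the history twice, once without and once with the new message, and keeping only the suffix that was appended. That string-diff trick works with any deterministic formatter, as in this toy sketch:

```cpp
// The "format twice, keep the suffix" trick used by common_chat_format_single:
// the delta for one new message is whatever the fully formatted chat gained
// after appending it. format_chat here is a toy stand-in for the template call.
#include <cstdio>
#include <string>
#include <vector>

struct msg { std::string role, content; };

static std::string format_chat(const std::vector<msg> & msgs) {
    std::string out;
    for (const auto & m : msgs) {
        out += "<|" + m.role + "|> " + m.content + "\n";
    }
    return out;
}

int main() {
    std::vector<msg> past = { { "user", "Hello" }, { "assistant", "Hi there" } };

    const std::string fmt_past = format_chat(past);

    std::vector<msg> with_new = past;
    with_new.push_back({ "user", "How are you?" });
    const std::string fmt_new = format_chat(with_new);

    // the single-message delta is the suffix added by the new message
    const std::string delta = fmt_new.substr(fmt_past.size());
    std::printf("%s", delta.c_str()); // prints only the newly appended part
    return 0;
}
```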
std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) { std::string common_chat_format_example(const struct llama_model * model,
const std::string & tmpl) {
std::vector<common_chat_msg> msgs = { std::vector<common_chat_msg> msgs = {
{"system", "You are a helpful assistant", {}}, {"system", "You are a helpful assistant"},
{"user", "Hello", {}}, {"user", "Hello"},
{"assistant", "Hi there", {}}, {"assistant", "Hi there"},
{"user", "How are you?", {}}, {"user", "How are you?"},
}; };
return common_chat_apply_template(tmpl, msgs, true, use_jinja); return common_chat_apply_template(model, tmpl, msgs, true);
}
#define CHATML_TEMPLATE_SRC \
"{%- for message in messages -%}\n" \
" {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
"{%- endfor -%}\n" \
"{%- if add_generation_prompt -%}\n" \
" {{- '<|im_start|>assistant\n' -}}\n" \
"{%- endif -%}"
common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
{
std::string default_template_src;
std::string template_tool_use_src;
bool has_explicit_template = !chat_template_override.empty();
if (chat_template_override.empty()) {
auto str = llama_model_chat_template(model, /* name */ nullptr);
if (str) {
default_template_src = str;
has_explicit_template = true;
}
str = llama_model_chat_template(model, /* name */ "tool_use");
if (str) {
template_tool_use_src = str;
has_explicit_template = true;
}
} else {
default_template_src = chat_template_override;
}
if (default_template_src.empty() || default_template_src == "chatml") {
if (!template_tool_use_src.empty()) {
default_template_src = template_tool_use_src;
} else {
default_template_src = CHATML_TEMPLATE_SRC;
}
}
auto vocab = llama_model_get_vocab(model);
const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
if (token == LLAMA_TOKEN_NULL) {
if (default_template_src.find(jinja_variable_name) != std::string::npos
|| template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
}
return std::string();
} else {
return common_token_to_piece(vocab, token, true);
}
};
auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
try {
return {
has_explicit_template,
std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
template_tool_use_src.empty()
? nullptr
: std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
};
} catch (const std::exception & e) {
LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
return {
has_explicit_template,
std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
nullptr,
};
}
} }
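The `CHATML_TEMPLATE_SRC` fallback introduced on one side of this diff is a Jinja rendition of the ChatML layout: each message becomes `<|im_start|>role\ncontent<|im_end|>\n`, with a trailing `<|im_start|>assistant\n` when a generation prompt is requested. A hand-rolled C++ rendering of that same layout (no Jinja engine involved) looks like this:

```cpp
// Hand-rolled equivalent of the CHATML_TEMPLATE_SRC fallback: wrap each message
// in <|im_start|>role ... <|im_end|> and optionally open an assistant turn.
#include <cstdio>
#include <string>
#include <vector>

struct chat_msg { std::string role, content; };

static std::string render_chatml(const std::vector<chat_msg> & msgs, bool add_generation_prompt) {
    std::string out;
    for (const auto & m : msgs) {
        out += "<|im_start|>" + m.role + "\n" + m.content + "<|im_end|>\n";
    }
    if (add_generation_prompt) {
        out += "<|im_start|>assistant\n";
    }
    return out;
}

int main() {
    std::vector<chat_msg> msgs = {
        { "system", "You are a helpful assistant" },
        { "user",   "Hello" },
    };
    std::printf("%s", render_chatml(msgs, /*add_generation_prompt=*/true).c_str());
    return 0;
}
```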
// //
@ -2024,9 +1799,7 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
break; break;
case 0: // max absolute case 0: // max absolute
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
if (sum < std::abs(inp[i])) { if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
sum = std::abs(inp[i]);
}
} }
sum /= 32760.0; // make an int16 range sum /= 32760.0; // make an int16 range
break; break;
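The hunk above only shows the "max absolute" branch of `common_embd_normalize` (case 0, which rescales into an int16-friendly range); the header further down declares `embd_norm` with a Euclidean default of 2 on one side. The sketch below covers both branches under the assumption that the unshown default follows the usual L2 norm; it is an illustration, not the project's exact implementation.

```cpp
// Sketch of embedding normalization in the spirit of common_embd_normalize:
// case 0 rescales by the max absolute value into an int16-style range (as in
// the hunk above), while the assumed default computes a Euclidean (L2) norm.
#include <cmath>
#include <cstdio>
#include <vector>

static void embd_normalize(const float * inp, float * out, int n, int embd_norm) {
    double sum = 0.0;
    if (embd_norm == 0) {            // max absolute, scaled to an int16 range
        for (int i = 0; i < n; i++) {
            if (sum < std::fabs(inp[i])) {
                sum = std::fabs(inp[i]);
            }
        }
        sum /= 32760.0;
    } else {                         // assumed default: Euclidean (L2) norm
        for (int i = 0; i < n; i++) {
            sum += (double) inp[i] * inp[i];
        }
        sum = std::sqrt(sum);
    }
    const float norm = sum > 0.0 ? (float) (1.0 / sum) : 0.0f;
    for (int i = 0; i < n; i++) {
        out[i] = inp[i] * norm;
    }
}

int main() {
    std::vector<float> embd = { 3.0f, 4.0f };
    std::vector<float> out(embd.size());
    embd_normalize(embd.data(), out.data(), (int) embd.size(), 2);
    std::printf("%f %f\n", out[0], out[1]); // 0.6 0.8
    return 0;
}
```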


@ -2,9 +2,8 @@
#pragma once #pragma once
#include "llama-cpp.h" #include "llama.h"
#include <set>
#include <string> #include <string>
#include <vector> #include <vector>
#include <sstream> #include <sstream>
@ -25,20 +24,22 @@
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
struct common_adapter_lora_info { struct common_lora_adapter_info {
std::string path; std::string path;
float scale; float scale;
};
struct llama_adapter_lora * ptr; struct common_lora_adapter_container : common_lora_adapter_info {
struct llama_lora_adapter * adapter;
}; };
using llama_tokens = std::vector<llama_token>; using llama_tokens = std::vector<llama_token>;
// build info // build info
extern int LLAMA_BUILD_NUMBER; extern int LLAMA_BUILD_NUMBER;
extern const char * LLAMA_COMMIT; extern char const * LLAMA_COMMIT;
extern const char * LLAMA_COMPILER; extern char const * LLAMA_COMPILER;
extern const char * LLAMA_BUILD_TARGET; extern char const * LLAMA_BUILD_TARGET;
struct common_control_vector_load_info; struct common_control_vector_load_info;
@ -79,7 +80,6 @@ enum llama_example {
LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_COUNT, LLAMA_EXAMPLE_COUNT,
}; };
@ -95,7 +95,6 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_TEMPERATURE = 7, COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
COMMON_SAMPLER_TYPE_XTC = 8, COMMON_SAMPLER_TYPE_XTC = 8,
COMMON_SAMPLER_TYPE_INFILL = 9, COMMON_SAMPLER_TYPE_INFILL = 9,
COMMON_SAMPLER_TYPE_PENALTIES = 10,
}; };
// dimensionality reduction methods, used by cvector-generator // dimensionality reduction methods, used by cvector-generator
@ -104,17 +103,6 @@ enum dimre_method {
DIMRE_METHOD_MEAN, DIMRE_METHOD_MEAN,
}; };
enum common_conversation_mode {
COMMON_CONVERSATION_MODE_DISABLED = 0,
COMMON_CONVERSATION_MODE_ENABLED = 1,
COMMON_CONVERSATION_MODE_AUTO = 2,
};
struct common_grammar_trigger {
std::string word;
bool at_start;
};
// sampling parameters // sampling parameters
struct common_params_sampling { struct common_params_sampling {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@ -142,6 +130,7 @@ struct common_params_sampling {
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false; bool ignore_eos = false;
bool no_perf = false; // disable performance metrics bool no_perf = false; // disable performance metrics
bool timing_per_token = false; bool timing_per_token = false;
@ -150,7 +139,6 @@ struct common_params_sampling {
std::vector<enum common_sampler_type> samplers = { std::vector<enum common_sampler_type> samplers = {
COMMON_SAMPLER_TYPE_PENALTIES,
COMMON_SAMPLER_TYPE_DRY, COMMON_SAMPLER_TYPE_DRY,
COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_TOP_K,
COMMON_SAMPLER_TYPE_TYPICAL_P, COMMON_SAMPLER_TYPE_TYPICAL_P,
@ -160,11 +148,7 @@ struct common_params_sampling {
COMMON_SAMPLER_TYPE_TEMPERATURE, COMMON_SAMPLER_TYPE_TEMPERATURE,
}; };
std::string grammar; // optional BNF-like grammar to constrain sampling std::string grammar; // optional BNF-like grammar to constrain sampling
bool grammar_lazy = false;
std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
std::set<llama_token> preserved_tokens;
std::vector<llama_logit_bias> logit_bias; // logit biases to apply std::vector<llama_logit_bias> logit_bias; // logit biases to apply
@ -174,7 +158,6 @@ struct common_params_sampling {
struct common_params_speculative { struct common_params_speculative {
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_ctx = 0; // draft context size int32_t n_ctx = 0; // draft context size
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@ -185,21 +168,7 @@ struct common_params_speculative {
struct cpu_params cpuparams; struct cpu_params cpuparams;
struct cpu_params cpuparams_batch; struct cpu_params cpuparams_batch;
std::string hf_repo = ""; // HF repo // NOLINT std::string model = ""; // draft model for speculative decoding // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string model = ""; // draft model for speculative decoding // NOLINT
std::string model_url = ""; // model url to download // NOLINT
};
struct common_params_vocoder {
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string model = ""; // model path // NOLINT
std::string model_url = ""; // model url to download // NOLINT
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
}; };
struct common_params { struct common_params {
@ -224,13 +193,11 @@ struct common_params {
float defrag_thold = 0.1f; // KV cache defragmentation threshold float defrag_thold = 0.1f; // KV cache defragmentation threshold
// offload params // offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
struct cpu_params cpuparams; struct cpu_params cpuparams;
struct cpu_params cpuparams_batch; struct cpu_params cpuparams_batch;
@ -244,12 +211,11 @@ struct common_params {
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
struct common_params_sampling sampling; struct common_params_sampling sampling;
struct common_params_speculative speculative; struct common_params_speculative speculative;
struct common_params_vocoder vocoder;
std::string model = ""; // model path // NOLINT std::string model = ""; // model path // NOLINT
std::string model_alias = ""; // model alias // NOLINT std::string model_alias = "unknown"; // model alias // NOLINT
std::string model_url = ""; // model url to download // NOLINT std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token // NOLINT std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT std::string hf_repo = ""; // HF repo // NOLINT
@ -262,13 +228,14 @@ struct common_params {
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT std::string logits_file = ""; // file for saving *all* logits // NOLINT
std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
std::vector<std::string> in_files; // all input files std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides; std::vector<llama_model_kv_override> kv_overrides;
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply) bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
@ -296,6 +263,7 @@ struct common_params {
bool special = false; // enable special token output bool special = false; // enable special token output
bool interactive = false; // interactive mode bool interactive = false; // interactive mode
bool interactive_first = false; // wait for user input immediately bool interactive_first = false; // wait for user input immediately
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
@ -318,10 +286,8 @@ struct common_params {
bool warmup = true; // warmup run bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data bool check_tensors = false; // validate tensor data
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K std::string cache_type_k = "f16"; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V std::string cache_type_v = "f16"; // KV cache data type for the V
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
// multimodal models (see examples/llava) // multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector // NOLINT std::string mmproj = ""; // path to multimodal projector // NOLINT
@ -344,7 +310,6 @@ struct common_params {
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT std::string public_path = ""; // NOLINT
std::string chat_template = ""; // NOLINT std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true; bool enable_chat_template = true;
std::vector<std::string> api_keys; std::vector<std::string> api_keys;
@ -439,10 +404,6 @@ std::string string_format(const char * fmt, ...);
std::string string_strip(const std::string & str); std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp(); std::string string_get_sortable_timestamp();
std::string string_join(const std::vector<std::string> & values, const std::string & separator);
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
std::string string_repeat(const std::string & str, size_t n);
void string_replace_all(std::string & s, const std::string & search, const std::string & replace); void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
template<class T> template<class T>
@ -476,16 +437,6 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
return parts; return parts;
} }
static bool string_starts_with(const std::string & str,
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
static bool string_ends_with(const std::string & str,
const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides); bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input); void string_process_escapes(std::string & input);
@ -508,12 +459,10 @@ std::string fs_get_cache_file(const std::string & filename);
// Model utils // Model utils
// //
// note: defines object's lifetime
struct common_init_result { struct common_init_result {
llama_model_ptr model; struct llama_model * model = nullptr;
llama_context_ptr context; struct llama_context * context = nullptr;
std::vector<common_lora_adapter_container> lora_adapters;
std::vector<llama_adapter_lora_ptr> lora;
}; };
struct common_init_result common_init_from_params(common_params & params); struct common_init_result common_init_from_params(common_params & params);
@ -527,7 +476,6 @@ struct llama_model * common_load_model_from_url(
const std::string & local_path, const std::string & local_path,
const std::string & hf_token, const std::string & hf_token,
const struct llama_model_params & params); const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf( struct llama_model * common_load_model_from_hf(
const std::string & repo, const std::string & repo,
const std::string & remote_path, const std::string & remote_path,
@ -535,12 +483,8 @@ struct llama_model * common_load_model_from_hf(
const std::string & hf_token, const std::string & hf_token,
const struct llama_model_params & params); const struct llama_model_params & params);
std::pair<std::string, std::string> common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & hf_token);
// clear LoRA adapters from context, then apply new list of adapters // clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora); void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
// //
// Batch utils // Batch utils
@ -578,7 +522,7 @@ std::vector<llama_token> common_tokenize(
bool parse_special = false); bool parse_special = false);
std::vector<llama_token> common_tokenize( std::vector<llama_token> common_tokenize(
const struct llama_vocab * vocab, const struct llama_model * model,
const std::string & text, const std::string & text,
bool add_special, bool add_special,
bool parse_special = false); bool parse_special = false);
@ -590,21 +534,11 @@ std::string common_token_to_piece(
llama_token token, llama_token token,
bool special = true); bool special = true);
std::string common_token_to_piece(
const struct llama_vocab * vocab,
llama_token token,
bool special = true);
// detokenizes a vector of tokens into a string // detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode` // should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens // optionally renders special/control tokens
std::string common_detokenize( std::string common_detokenize(
const struct llama_context * ctx, llama_context * ctx,
const std::vector<llama_token> & tokens,
bool special = true);
std::string common_detokenize(
const struct llama_vocab * vocab,
const std::vector<llama_token> & tokens, const std::vector<llama_token> & tokens,
bool special = true); bool special = true);
@ -612,57 +546,33 @@ std::string common_detokenize(
// Chat template utils // Chat template utils
// //
struct common_tool_call {
std::string name;
std::string arguments;
std::string id;
};
// same with llama_chat_message, but uses std::string // same with llama_chat_message, but uses std::string
struct common_chat_msg { struct common_chat_msg {
std::string role; std::string role;
std::string content; std::string content;
std::vector<common_tool_call> tool_calls;
std::string tool_plan = "";
}; };
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja); bool common_chat_verify_template(const std::string & tmpl);
namespace minja {
class chat_template;
}
typedef minja::chat_template common_chat_template;
struct common_chat_templates {
bool has_explicit_template; // Model had builtin template or template overridde was specified.
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
std::unique_ptr<common_chat_template> template_tool_use;
};
// CPP wrapper for llama_chat_apply_template // CPP wrapper for llama_chat_apply_template
// If the built-in template is not supported, we default to chatml // If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error // If the custom "tmpl" is not supported, we throw an error
std::string common_chat_apply_template( std::string common_chat_apply_template(const struct llama_model * model,
const common_chat_template & tmpl, const std::string & tmpl,
const std::vector<common_chat_msg> & chat, const std::vector<common_chat_msg> & chat,
bool add_ass, bool add_ass);
bool use_jinja);
// Format single message, while taking into account the position of that message in chat history // Format single message, while taking into account the position of that message in chat history
std::string common_chat_format_single( std::string common_chat_format_single(const struct llama_model * model,
const common_chat_template & tmpl, const std::string & tmpl,
const std::vector<common_chat_msg> & past_msg, const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg, const common_chat_msg & new_msg,
bool add_ass, bool add_ass);
bool use_jinja);
// Returns an example of formatted chat // Returns an example of formatted chat
std::string common_chat_format_example( std::string common_chat_format_example(const struct llama_model * model,
const common_chat_template & tmpl, bool use_jinja); const std::string & tmpl);
common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);
// //
// KV cache utils // KV cache utils
@ -678,8 +588,7 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
// Embedding utils // Embedding utils
// //
// TODO: repace embd_norm with an enum void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n); float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
@ -708,10 +617,6 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
// Split utils // Split utils
// //
namespace { static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
const char * const LLM_KV_SPLIT_NO = "split.no"; static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
const char * const LLM_KV_SPLIT_COUNT = "split.count";
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
}



@ -1,6 +1,4 @@
#include "json-schema-to-grammar.h" #include "json-schema-to-grammar.h"
#include "common.h"
#include <algorithm> #include <algorithm>
#include <fstream> #include <fstream>
#include <map> #include <map>
@ -13,6 +11,11 @@
using json = nlohmann::ordered_json; using json = nlohmann::ordered_json;
template <typename Iterator>
static std::string join(Iterator begin, Iterator end, const std::string & separator);
static std::string repeat(const std::string & str, size_t n);
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") { static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max(); auto has_max = max_items != std::numeric_limits<int>::max();
@ -125,8 +128,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
if (sub_len > 0) { if (sub_len > 0) {
auto from_sub = from.substr(i + 1); auto from_sub = from.substr(i + 1);
auto to_sub = to.substr(i + 1); auto to_sub = to.substr(i + 1);
auto sub_zeros = string_repeat("0", sub_len); auto sub_zeros = repeat("0", sub_len);
auto sub_nines = string_repeat("9", sub_len); auto sub_nines = repeat("9", sub_len);
auto to_reached = false; auto to_reached = false;
out << "("; out << "(";
@ -185,8 +188,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
auto max_digits = max_s.length(); auto max_digits = max_s.length();
for (auto digits = min_digits; digits < max_digits; digits++) { for (auto digits = min_digits; digits < max_digits; digits++) {
uniform_range(min_s, string_repeat("9", digits)); uniform_range(min_s, repeat("9", digits));
min_s = "1" + string_repeat("0", digits); min_s = "1" + repeat("0", digits);
out << " | "; out << " | ";
} }
uniform_range(min_s, max_s); uniform_range(min_s, max_s);
@ -315,6 +318,49 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'}; std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
template <typename Iterator>
std::string join(Iterator begin, Iterator end, const std::string & separator) {
std::ostringstream result;
if (begin != end) {
result << *begin;
for (Iterator it = begin + 1; it != end; ++it) {
result << separator << *it;
}
}
return result.str();
}
static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
std::vector<std::string> tokens;
size_t start = 0;
size_t end = str.find(delimiter);
while (end != std::string::npos) {
tokens.push_back(str.substr(start, end - start));
start = end + delimiter.length();
end = str.find(delimiter, start);
}
tokens.push_back(str.substr(start));
return tokens;
}
static std::string repeat(const std::string & str, size_t n) {
if (n == 0) {
return "";
}
std::string result;
result.reserve(str.length() * n);
for (size_t i = 0; i < n; ++i) {
result += str;
}
return result;
}
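The converter later calls `split` on the contents of a `{m,n}` repetition bound and `repeat` to build digit runs; these local statics exist on the side of the diff that does not use `string_split`/`string_repeat` from common.h. A quick standalone demonstration of the `{m,n}` splitting step:

```cpp
// Standalone demo of the split helper as used for "{2,4}"-style repetition
// bounds further down in the converter: strip the braces, split on ",", and
// derive min/max repetition counts.
#include <cstdio>
#include <limits>
#include <string>
#include <vector>

static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
    std::vector<std::string> tokens;
    size_t start = 0;
    size_t end = str.find(delimiter);
    while (end != std::string::npos) {
        tokens.push_back(str.substr(start, end - start));
        start = end + delimiter.length();
        end = str.find(delimiter, start);
    }
    tokens.push_back(str.substr(start));
    return tokens;
}

int main() {
    const std::string curly_brackets = "{2,4}";
    auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");

    int min_times = nums[0].empty() ? 0 : std::stoi(nums[0]);
    int max_times = std::numeric_limits<int>::max();
    if (nums.size() == 1) {
        max_times = min_times;              // "{2}" means exactly 2
    } else if (!nums[1].empty()) {
        max_times = std::stoi(nums[1]);     // "{2,4}" -> at most 4; "{2,}" stays unbounded
    }
    std::printf("min = %d, max = %d\n", min_times, max_times);
    return 0;
}
```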
static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) { static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
std::smatch match; std::smatch match;
std::string result; std::string result;
@ -343,7 +389,6 @@ static std::string format_literal(const std::string & literal) {
class SchemaConverter { class SchemaConverter {
private: private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json; std::function<json(const std::string &)> _fetch_json;
bool _dotall; bool _dotall;
std::map<std::string, std::string> _rules; std::map<std::string, std::string> _rules;
@ -373,7 +418,7 @@ private:
for (size_t i = 0; i < alt_schemas.size(); i++) { for (size_t i = 0; i < alt_schemas.size(); i++) {
rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i))); rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
} }
return string_join(rules, " | "); return join(rules.begin(), rules.end(), " | ");
} }
std::string _visit_pattern(const std::string & pattern, const std::string & name) { std::string _visit_pattern(const std::string & pattern, const std::string & name) {
@ -436,7 +481,7 @@ private:
for (const auto & item : ret) { for (const auto & item : ret) {
results.push_back(to_rule(item)); results.push_back(to_rule(item));
} }
return std::make_pair(string_join(results, " "), false); return std::make_pair(join(results.begin(), results.end(), " "), false);
}; };
while (i < length) { while (i < length) {
@ -494,7 +539,7 @@ private:
} }
curly_brackets += '}'; curly_brackets += '}';
i++; i++;
auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ","); auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
int min_times = 0; int min_times = 0;
int max_times = std::numeric_limits<int>::max(); int max_times = std::numeric_limits<int>::max();
try { try {
@ -764,11 +809,10 @@ private:
public: public:
SchemaConverter( SchemaConverter(
const std::function<json(const std::string &)> & fetch_json, const std::function<json(const std::string &)> & fetch_json,
bool dotall, bool dotall)
bool compact_spaces)
: _fetch_json(fetch_json), _dotall(dotall) : _fetch_json(fetch_json), _dotall(dotall)
{ {
_rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE; _rules["space"] = SPACE_RULE;
} }
void resolve_refs(json & schema, const std::string & url) { void resolve_refs(json & schema, const std::string & url) {
@ -810,7 +854,7 @@ public:
return; return;
} }
std::string pointer = ref.substr(ref.find('#') + 1); std::string pointer = ref.substr(ref.find('#') + 1);
std::vector<std::string> tokens = string_split(pointer, "/"); std::vector<std::string> tokens = split(pointer, "/");
for (size_t i = 1; i < tokens.size(); ++i) { for (size_t i = 1; i < tokens.size(); ++i) {
std::string sel = tokens[i]; std::string sel = tokens[i];
if (target.is_null() || !target.contains(sel)) { if (target.is_null() || !target.contains(sel)) {
@ -861,7 +905,7 @@ public:
for (const auto & v : schema["enum"]) { for (const auto & v : schema["enum"]) {
enum_values.push_back(_generate_constant_rule(v)); enum_values.push_back(_generate_constant_rule(v));
} }
return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space"); return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
} else if ((schema_type.is_null() || schema_type == "object") } else if ((schema_type.is_null() || schema_type == "object")
&& (schema.contains("properties") || && (schema.contains("properties") ||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) { (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@ -975,10 +1019,10 @@ public:
void check_errors() { void check_errors() {
if (!_errors.empty()) { if (!_errors.empty()) {
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n")); throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
} }
if (!_warnings.empty()) { if (!_warnings.empty()) {
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str()); fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
} }
} }
@ -991,35 +1035,11 @@ public:
} }
}; };
std::string json_schema_to_grammar(const json & schema, bool force_gbnf) { std::string json_schema_to_grammar(const json & schema) {
#ifdef LLAMA_USE_LLGUIDANCE SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
if (!force_gbnf) { auto copy = schema;
return "%llguidance {}\nstart: %json " + schema.dump(); converter.resolve_refs(copy, "input");
} converter.visit(copy, "");
#else
(void)force_gbnf;
#endif // LLAMA_USE_LLGUIDANCE
return build_grammar([&](const common_grammar_builder & callbacks) {
auto copy = schema;
callbacks.resolve_refs(copy);
callbacks.add_schema("", copy);
});
}
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
common_grammar_builder builder {
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
return converter._add_rule(name, rule);
},
/* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
return converter.visit(schema, name == "root" ? "" : name);
},
/* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
converter.resolve_refs(schema, "");
}
};
cb(builder);
converter.check_errors(); converter.check_errors();
return converter.format_grammar(); return converter.format_grammar();
} }
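For callers, the entry point on the side that takes a `force_gbnf` flag is `json_schema_to_grammar(schema, force_gbnf)` (declared in json-schema-to-grammar.h below); it either emits an llguidance stub or goes through `build_grammar` with the callback-based `common_grammar_builder`. A hedged usage sketch, assuming the translation unit is compiled and linked together with the common library from this change:

```cpp
// Usage sketch for the converter, assuming it is built and linked with
// common/json-schema-to-grammar.cpp from this change (not standalone
// otherwise). Converts a tiny JSON schema into a GBNF grammar string.
#include "json-schema-to-grammar.h"

#include <cstdio>

int main() {
    nlohmann::ordered_json schema = {
        { "type", "object" },
        { "properties", {
            { "name", { { "type", "string" } } },
            { "age",  { { "type", "integer" } } },
        } },
        { "required", { "name" } },
    };

    // force_gbnf defaults to false; passing true skips the llguidance path
    // when the library was built with LLAMA_USE_LLGUIDANCE.
    const std::string grammar = json_schema_to_grammar(schema, /*force_gbnf=*/true);
    std::printf("%s\n", grammar.c_str());
    return 0;
}
```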


@ -5,18 +5,4 @@
#define JSON_ASSERT GGML_ASSERT #define JSON_ASSERT GGML_ASSERT
#include "json.hpp" #include "json.hpp"
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema, std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
bool force_gbnf = false);
struct common_grammar_builder {
std::function<std::string(const std::string &, const std::string &)> add_rule;
std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
std::function<void(nlohmann::ordered_json &)> resolve_refs;
};
struct common_grammar_options {
bool dotall = false;
bool compact_spaces = false;
};
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});

View file

@ -1,270 +0,0 @@
#include "sampling.h"
#include "log.h"
#ifdef LLAMA_USE_LLGUIDANCE
# include "llguidance.h"
# include <cmath>
struct llama_sampler_llg {
const llama_vocab * vocab;
std::string grammar_kind;
std::string grammar_data;
LlgTokenizer * tokenizer;
LlgConstraint * grammar;
LlgMaskResult llg_res;
bool has_llg_res;
};
static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
const char * grammar_data) {
LlgConstraintInit cinit;
llg_constraint_init_set_defaults(&cinit, tokenizer);
const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
if (log_level && *log_level) {
cinit.log_stderr_level = atoi(log_level);
}
auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
if (llg_get_error(c)) {
LOG_ERR("llg error: %s\n", llg_get_error(c));
llg_free_constraint(c);
return nullptr;
}
return c;
}
static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
return "llguidance";
}
static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
LlgCommitResult res;
llg_commit_token(ctx->grammar, token, &res);
ctx->has_llg_res = false;
}
}
static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
if (!ctx->has_llg_res) {
if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
ctx->has_llg_res = true;
} else {
LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
llg_free_constraint(ctx->grammar);
ctx->grammar = nullptr;
}
}
if (ctx->has_llg_res) {
if (ctx->llg_res.is_stop) {
for (size_t i = 0; i < cur_p->size; ++i) {
if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
cur_p->data[i].logit = -INFINITY;
}
}
} else {
const uint32_t * mask = ctx->llg_res.sample_mask;
for (size_t i = 0; i < cur_p->size; ++i) {
auto token = cur_p->data[i].id;
if ((mask[token / 32] & (1 << (token % 32))) == 0) {
cur_p->data[i].logit = -INFINITY;
}
}
}
}
}
}
static void llama_sampler_llg_reset(llama_sampler * smpl) {
auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (!ctx->grammar) {
return;
}
auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
llg_free_constraint(ctx->grammar);
ctx->grammar = grammar_new;
ctx->has_llg_res = false;
}
static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_llg *) smpl->ctx;
auto * result = llama_sampler_init_llg(ctx->vocab, nullptr, nullptr);
// copy the state
{
auto * result_ctx = (llama_sampler_llg *) result->ctx;
if (ctx->grammar) {
result_ctx->grammar_kind = ctx->grammar_kind;
result_ctx->grammar_data = ctx->grammar_data;
result_ctx->grammar = llg_clone_constraint(ctx->grammar);
result_ctx->tokenizer = llg_clone_tokenizer(ctx->tokenizer);
}
}
return result;
}
static void llama_sampler_llg_free(llama_sampler * smpl) {
const auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
llg_free_constraint(ctx->grammar);
llg_free_tokenizer(ctx->tokenizer);
}
delete ctx;
}
static llama_sampler_i llama_sampler_llg_i = {
/* .name = */ llama_sampler_llg_name,
/* .accept = */ llama_sampler_llg_accept_impl,
/* .apply = */ llama_sampler_llg_apply,
/* .reset = */ llama_sampler_llg_reset,
/* .clone = */ llama_sampler_llg_clone,
/* .free = */ llama_sampler_llg_free,
};
static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
uint32_t * output_tokens, size_t output_tokens_len) {
const llama_vocab * vocab = (const llama_vocab *) user_data;
int r = 0;
try {
r = llama_tokenize(vocab, (const char *) bytes, bytes_len, (int32_t *) output_tokens, output_tokens_len, false,
true);
} catch (const std::exception & e) {
GGML_ABORT("llama_tokenize failed: %s\n", e.what());
}
if (r < 0) {
return -r;
}
return r;
}
static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab) {
// TODO store the tokenizer in the vocab somehow
static const llama_vocab * vocab_cache;
static LlgTokenizer * tokenizer_cache;
if (vocab_cache == vocab) {
return llg_clone_tokenizer(tokenizer_cache);
}
auto tok_eos = llama_vocab_eot(vocab);
if (tok_eos == LLAMA_TOKEN_NULL) {
tok_eos = llama_vocab_eos(vocab);
}
size_t vocab_size = llama_vocab_n_tokens(vocab);
auto token_lens = new uint32_t[vocab_size];
// we typically have ~7 bytes per token; let's go on the safe side here
auto token_bytes_size = vocab_size * 16 + 1024 * 1024;
auto token_bytes = new uint8_t[token_bytes_size];
size_t offset = 0;
for (size_t i = 0; i < vocab_size; i++) {
size_t max_token = 1024;
if (token_bytes_size - offset < max_token) {
GGML_ABORT("token_bytes buffer too small\n");
}
llama_token token = i;
auto dp = (char *) token_bytes + offset;
auto size = llama_detokenize(vocab, &token, 1, dp, max_token, false, false);
if (size < 0) {
GGML_ABORT("llama_detokenize failed\n");
}
if (size == 0) {
size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true);
if (size < 0) {
GGML_ABORT("llama_detokenize failed\n");
}
if (size != 0) {
*dp = '\xff'; // special token prefix marker
size += 1;
}
}
token_lens[i] = size;
offset += size;
}
LlgTokenizerInit tinit = {
/* .vocab_size = */ (uint32_t) vocab_size,
/* .tok_eos = */ (uint32_t) tok_eos,
/* .token_lens = */ token_lens,
/* .token_bytes = */ token_bytes,
/* .tokenizer_json = */ nullptr,
/* .tokenize_assumes_string = */ true,
/* .tokenize_fn = */ llama_sampler_llg_tokenize_fn,
/* .use_approximate_greedy_tokenize_fn = */ false,
/* .tokenize_user_data = */ vocab,
};
char error_buffer[1024];
LlgTokenizer * tokenizer = llg_new_tokenizer(&tinit, error_buffer, sizeof(error_buffer));
delete[] token_bytes;
delete[] token_lens;
if (tokenizer == nullptr) {
LOG_ERR("llg tokenizer error: %s\n", error_buffer);
return tokenizer;
}
if (tokenizer_cache) {
llg_free_tokenizer(tokenizer_cache);
}
vocab_cache = vocab;
tokenizer_cache = tokenizer;
return llg_clone_tokenizer(tokenizer_cache);
}
llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * grammar_kind,
const char * grammar_data) {
auto * ctx = new llama_sampler_llg;
if (grammar_kind != nullptr && grammar_kind[0] != '\0') {
auto tokenizer = llama_sampler_llg_new_tokenizer(vocab);
*ctx = {
/* .vocab = */ vocab,
/* .grammar_kind = */ grammar_kind,
/* .grammar_data = */ grammar_data,
/* .tokenizer = */ tokenizer,
/* .grammar = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
/* .llg_res = */ {},
/* .has_llg_res = */ false,
};
} else {
*ctx = {
/* .vocab = */ vocab,
/* .grammar_kind = */ {},
/* .grammar_data = */ {},
/* .tokenizer = */ nullptr,
/* .grammar = */ nullptr,
/* .llg_res = */ {},
/* .has_llg_res = */ false,
};
}
return llama_sampler_init(
/* .iface = */ &llama_sampler_llg_i,
/* .ctx = */ ctx
);
}
#else
llama_sampler * llama_sampler_init_llg(const llama_vocab *, const char *, const char *) {
LOG_WRN("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
return nullptr;
}
#endif // LLAMA_USE_LLGUIDANCE
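As an illustrative aside, a minimal sketch of wiring this sampler into a sampler chain, assuming a build with `LLAMA_USE_LLGUIDANCE` and an already-loaded `llama_model`; every call used here appears elsewhere in this changeset, and the helper name is hypothetical:

```cpp
// Illustrative sketch: build a sampler chain constrained by an llguidance "lark" grammar.
// Requires a build with LLAMA_USE_LLGUIDANCE; the function name is hypothetical.
static llama_sampler * make_llg_chain(const llama_model * model, const std::string & grammar_text, uint32_t seed) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_llg(vocab, "lark", grammar_text.c_str()));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));  // pick the next token from the masked logits
    return chain;
}
```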

View file

@ -14,6 +14,16 @@ void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity; common_log_verbosity_thold = verbosity;
} }
#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD "\033[1m"
#define LOG_COL_RED "\033[31m"
#define LOG_COL_GREEN "\033[32m"
#define LOG_COL_YELLOW "\033[33m"
#define LOG_COL_BLUE "\033[34m"
#define LOG_COL_MAGENTA "\033[35m"
#define LOG_COL_CYAN "\033[36m"
#define LOG_COL_WHITE "\033[37m"
static int64_t t_us() { static int64_t t_us() {
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count(); return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
} }
@ -196,7 +206,6 @@ public:
vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy); vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
} }
#endif #endif
va_end(args_copy);
} }
entry.level = level; entry.level = level;

View file

@ -2,17 +2,6 @@
#include "ggml.h" // for ggml_log_level #include "ggml.h" // for ggml_log_level
#define LOG_CLR_TO_EOL "\033[K\r"
#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD "\033[1m"
#define LOG_COL_RED "\033[31m"
#define LOG_COL_GREEN "\033[32m"
#define LOG_COL_YELLOW "\033[33m"
#define LOG_COL_BLUE "\033[34m"
#define LOG_COL_MAGENTA "\033[35m"
#define LOG_COL_CYAN "\033[36m"
#define LOG_COL_WHITE "\033[37m"
#ifndef __GNUC__ #ifndef __GNUC__
# define LOG_ATTRIBUTE_FORMAT(...) # define LOG_ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__) #elif defined(__MINGW32__)

File diff suppressed because it is too large.

View file

@ -65,13 +65,13 @@ constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) { static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
if (part_static_it == nc_static.end()) { if (part_static_it == nc_static.end()) {
return LLAMA_TOKEN_NULL; return -1;
} }
const common_ngram_cache_part part_static = part_static_it->second; const common_ngram_cache_part part_static = part_static_it->second;
int max_count_static = 0; int max_count_static = 0;
int sum_count_static = 0; int sum_count_static = 0;
llama_token max_token = LLAMA_TOKEN_NULL; llama_token max_token = -1;
for (std::pair<llama_token, int> token_count_static : part_static) { for (std::pair<llama_token, int> token_count_static : part_static) {
const llama_token token = token_count_static.first; const llama_token token = token_count_static.first;
@ -85,10 +85,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
} }
if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) { if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
return LLAMA_TOKEN_NULL; return -1;
} }
if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) { if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
return LLAMA_TOKEN_NULL; return -1;
} }
return max_token; return max_token;
} }
@ -98,9 +98,9 @@ static llama_token try_draft(
common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static, common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
const int * min_sample_size, const int * min_percent) { const int * min_sample_size, const int * min_percent) {
llama_token drafted_token = LLAMA_TOKEN_NULL; llama_token drafted_token = -1;
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) { for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
const common_ngram ngram_primary = ngrams_primary[i]; const common_ngram ngram_primary = ngrams_primary[i];
common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
@ -112,7 +112,7 @@ static llama_token try_draft(
int max_count_primary = 0; int max_count_primary = 0;
int max_count_static = 0; int max_count_static = 0;
int sum_count_primary = 0; int sum_count_primary = 0;
llama_token max_token = LLAMA_TOKEN_NULL; llama_token max_token = -1;
for (std::pair<llama_token, int> token_count_primary : part_primary) { for (std::pair<llama_token, int> token_count_primary : part_primary) {
const llama_token token = token_count_primary.first; const llama_token token = token_count_primary.first;
@ -154,7 +154,7 @@ void common_ngram_cache_draft(
} }
while ((int) draft.size()-1 < n_draft) { while ((int) draft.size()-1 < n_draft) {
llama_token drafted_token = LLAMA_TOKEN_NULL; llama_token drafted_token = -1;
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
common_ngram ngram_static; common_ngram ngram_static;
@ -177,17 +177,17 @@ void common_ngram_cache_draft(
} }
ngrams_cd.push_back(ngram_cd); ngrams_cd.push_back(ngram_cd);
} }
if (drafted_token == LLAMA_TOKEN_NULL) { if (drafted_token == -1) {
drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax); drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
} }
if (drafted_token == LLAMA_TOKEN_NULL) { if (drafted_token == -1) {
drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict); drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
} }
if (drafted_token == LLAMA_TOKEN_NULL) { if (drafted_token == -1) {
drafted_token = try_draft(nc_static, ngram_static); drafted_token = try_draft(nc_static, ngram_static);
} }
if (drafted_token == LLAMA_TOKEN_NULL) { if (drafted_token == -1) {
break; break;
} }

View file

@ -17,13 +17,13 @@ struct common_ngram {
common_ngram() { common_ngram() {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = LLAMA_TOKEN_NULL; tokens[i] = -1;
} }
} }
common_ngram(const llama_token * input, const int ngram_size) { common_ngram(const llama_token * input, const int ngram_size) {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL; tokens[i] = i < ngram_size ? input[i] : -1;
} }
} }

View file

@ -113,10 +113,7 @@ struct common_sampler {
void set_logits(struct llama_context * ctx, int idx) { void set_logits(struct llama_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx); const auto * logits = llama_get_logits_ith(ctx, idx);
const llama_model * model = llama_get_model(ctx); const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
cur.resize(n_vocab); cur.resize(n_vocab);
@ -145,36 +142,13 @@ std::string common_params_sampling::print() const {
} }
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) { struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = params.no_perf; lparams.no_perf = params.no_perf;
std::vector<const char *> trigger_words;
trigger_words.reserve(params.grammar_trigger_words.size());
for (const auto & str : params.grammar_trigger_words) {
trigger_words.push_back(str.word.c_str());
}
struct llama_sampler * grmr;
if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
#else
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
} else {
grmr = params.grammar_lazy
? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
trigger_words.data(), trigger_words.size(),
params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
}
auto * result = new common_sampler { auto * result = new common_sampler {
/* .params = */ params, /* .params = */ params,
/* .grmr = */ grmr, /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
/* .chain = */ llama_sampler_chain_init(lparams), /* .chain = */ llama_sampler_chain_init(lparams),
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)), /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {}, /* .cur = */ {},
@ -183,24 +157,36 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain, llama_sampler_chain_add(result->chain,
llama_sampler_init_logit_bias( llama_sampler_init_logit_bias(
llama_vocab_n_tokens(vocab), llama_n_vocab(model),
params.logit_bias.size(), params.logit_bias.size(),
params.logit_bias.data())); params.logit_bias.data()));
llama_sampler_chain_add(result->chain,
llama_sampler_init_penalties(
llama_n_vocab (model),
llama_token_eos(model),
llama_token_nl (model),
params.penalty_last_n,
params.penalty_repeat,
params.penalty_freq,
params.penalty_present,
params.penalize_nl,
params.ignore_eos));
if (params.mirostat == 0) { if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) { for (const auto & cnstr : params.samplers) {
switch (cnstr) { switch (cnstr) {
case COMMON_SAMPLER_TYPE_DRY: case COMMON_SAMPLER_TYPE_DRY:
{ {
std::vector<const char *> c_breakers; std::vector<const char*> c_breakers;
c_breakers.reserve(params.dry_sequence_breakers.size()); c_breakers.reserve(params.dry_sequence_breakers.size());
for (const auto & str : params.dry_sequence_breakers) { for (const auto& str : params.dry_sequence_breakers) {
c_breakers.push_back(str.c_str()); c_breakers.push_back(str.c_str());
} }
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
} }
break; break;
case COMMON_SAMPLER_TYPE_TOP_K: case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break; break;
@ -220,10 +206,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break; break;
case COMMON_SAMPLER_TYPE_INFILL: case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab)); llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
break;
case COMMON_SAMPLER_TYPE_PENALTIES:
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
break; break;
default: default:
GGML_ASSERT(false && "unknown sampler type"); GGML_ASSERT(false && "unknown sampler type");
@ -232,7 +215,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) { } else if (params.mirostat == 1) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) { } else if (params.mirostat == 2) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
@ -432,7 +415,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
case COMMON_SAMPLER_TYPE_XTC: return 'x'; case COMMON_SAMPLER_TYPE_XTC: return 'x';
case COMMON_SAMPLER_TYPE_INFILL: return 'i'; case COMMON_SAMPLER_TYPE_INFILL: return 'i';
case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
default : return '?'; default : return '?';
} }
} }
@ -447,7 +429,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
case COMMON_SAMPLER_TYPE_XTC: return "xtc"; case COMMON_SAMPLER_TYPE_XTC: return "xtc";
case COMMON_SAMPLER_TYPE_INFILL: return "infill"; case COMMON_SAMPLER_TYPE_INFILL: return "infill";
case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
default : return ""; default : return "";
} }
} }
@ -462,7 +443,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "xtc", COMMON_SAMPLER_TYPE_XTC }, { "xtc", COMMON_SAMPLER_TYPE_XTC },
{ "infill", COMMON_SAMPLER_TYPE_INFILL }, { "infill", COMMON_SAMPLER_TYPE_INFILL },
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
}; };
// since samplers names are written multiple ways // since samplers names are written multiple ways
@ -509,7 +489,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES },
}; };
std::vector<common_sampler_type> samplers; std::vector<common_sampler_type> samplers;

View file

@ -102,6 +102,3 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names); std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars); std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
const char * grammar_kind, const char * grammar_data);

View file

@ -62,10 +62,6 @@ struct common_speculative * common_speculative_init(
} }
void common_speculative_free(struct common_speculative * spec) { void common_speculative_free(struct common_speculative * spec) {
if (spec == nullptr) {
return;
}
common_sampler_free(spec->smpl); common_sampler_free(spec->smpl);
llama_batch_free(spec->batch); llama_batch_free(spec->batch);
@ -79,13 +75,10 @@ bool common_speculative_are_compatible(
const struct llama_model * model_tgt = llama_get_model(ctx_tgt); const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
const struct llama_model * model_dft = llama_get_model(ctx_dft); const struct llama_model * model_dft = llama_get_model(ctx_dft);
const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const bool vocab_type_tgt = llama_vocab_type(model_tgt);
const struct llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt); LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
const bool vocab_type_dft = llama_vocab_type(vocab_dft); const bool vocab_type_dft = llama_vocab_type(model_dft);
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) { if (vocab_type_tgt != vocab_type_dft) {
@ -94,34 +87,34 @@ bool common_speculative_are_compatible(
return false; return false;
} }
if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) || if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) || llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) || llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) { llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__); LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt)); LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft)); LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
return false; return false;
} }
{ {
const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt); const int n_vocab_tgt = llama_n_vocab(model_tgt);
const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft); const int n_vocab_dft = llama_n_vocab(model_dft);
const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft); const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but " LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
"target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
__func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); __func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return false; return false;
} }
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i); const char * token_text_tgt = llama_token_get_text(model_tgt, i);
const char * token_text_dft = llama_vocab_get_text(vocab_dft, i); const char * token_text_dft = llama_token_get_text(model_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) { if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but " LOG_ERR("%s: draft model vocab must match target model to use speculation but "
"token %d content differs - target '%s', draft '%s'\n", __func__, i, "token %d content differs - target '%s', draft '%s'\n", __func__, i,
common_token_to_piece(ctx_tgt, i).c_str(), common_token_to_piece(ctx_tgt, i).c_str(),
common_token_to_piece(ctx_dft, i).c_str()); common_token_to_piece(ctx_dft, i).c_str());

File diff suppressed because it is too large.

View file

@ -17,7 +17,7 @@
# #
# python3 convert_hf_to_gguf_update.py <huggingface_token> # python3 convert_hf_to_gguf_update.py <huggingface_token>
# #
# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated # - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
# - Update llama.cpp with the new pre-tokenizer if necessary # - Update llama.cpp with the new pre-tokenizer if necessary
# #
# TODO: generate tokenizer tests for llama.cpp # TODO: generate tokenizer tests for llama.cpp
@ -65,50 +65,43 @@ else:
# TODO: add models here, base models preferred # TODO: add models here, base models preferred
models = [ models = [
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", }, {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", }, {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", }, {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", }, {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
{"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", }, {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
{"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", }, {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, {"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", }, {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", }, {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", }, {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", }, {"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
{"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", }, {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", }, {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", }, {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", }, {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", }, {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", }, {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", }, {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", }, {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
] ]

View file

@ -226,9 +226,6 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
base_name = lora_tensor_name.replace("base_model.model.", "") base_name = lora_tensor_name.replace("base_model.model.", "")
base_name = base_name.replace(".lora_A.weight", ".weight") base_name = base_name.replace(".lora_A.weight", ".weight")
base_name = base_name.replace(".lora_B.weight", ".weight") base_name = base_name.replace(".lora_B.weight", ".weight")
# models produced by mergekit-extract-lora have token embeddings in the adapter
base_name = base_name.replace(".lora_embedding_A", ".weight")
base_name = base_name.replace(".lora_embedding_B", ".weight")
return base_name return base_name
@ -263,10 +260,6 @@ def parse_args() -> argparse.Namespace:
"--base", type=Path, "--base", type=Path,
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config", help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
) )
parser.add_argument(
"--base-model-id", type=str,
help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
)
parser.add_argument( parser.add_argument(
"lora_path", type=Path, "lora_path", type=Path,
help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)", help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@ -297,7 +290,6 @@ if __name__ == '__main__':
dir_base_model: Path | None = args.base dir_base_model: Path | None = args.base
dir_lora: Path = args.lora_path dir_lora: Path = args.lora_path
base_model_id: str | None = args.base_model_id
lora_config = dir_lora / "adapter_config.json" lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors" input_model = dir_lora / "adapter_model.safetensors"
@ -321,10 +313,7 @@ if __name__ == '__main__':
lparams: dict[str, Any] = json.load(f) lparams: dict[str, Any] = json.load(f)
# load base model # load base model
if base_model_id is not None: if dir_base_model is None:
logger.info(f"Loading base model from Hugging Face: {base_model_id}")
hparams = load_hparams_from_hf(base_model_id)
elif dir_base_model is None:
if "base_model_name_or_path" in lparams: if "base_model_name_or_path" in lparams:
model_id = lparams["base_model_name_or_path"] model_id = lparams["base_model_name_or_path"]
logger.info(f"Loading base model from Hugging Face: {model_id}") logger.info(f"Loading base model from Hugging Face: {model_id}")
@ -382,16 +371,11 @@ if __name__ == '__main__':
if self.lazy: if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor) tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name) base_name = get_base_tensor_name(name)
# note: mergekit-extract-lora also adds token embeddings to the adapter is_lora_a = ".lora_A.weight" in name
is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name is_lora_b = ".lora_B.weight" in name
is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
if not is_lora_a and not is_lora_b: if not is_lora_a and not is_lora_b:
if ".base_layer.weight" in name: if ".base_layer.weight" in name:
continue continue
# mergekit-extract-lora add these layernorm to the adapter, we need to keep them
if "_layernorm" in name or ".norm" in name:
yield (base_name, tensor)
continue
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor") logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
if ".embed_tokens.weight" in name or ".lm_head.weight" in name: if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning") logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
@ -423,21 +407,9 @@ if __name__ == '__main__':
if name == "lm_head.weight" and len(dest) == 0: if name == "lm_head.weight" and len(dest) == 0:
raise ValueError("lm_head is present in adapter, but is ignored in base model") raise ValueError("lm_head is present in adapter, but is ignored in base model")
for dest_name, dest_data in dest: for dest_name, dest_data in dest:
# mergekit-extract-lora add these layernorm to the adapter
if "_norm" in dest_name:
assert dest_data.dim() == 1
yield (dest_name, dest_data)
continue
# otherwise, we must get the lora_A and lora_B tensors
assert isinstance(dest_data, LoraTorchTensor) assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B() lora_a, lora_b = dest_data.get_lora_A_B()
# note: mergekit-extract-lora flip and transpose A and B
# here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
if "token_embd.weight" in dest_name:
lora_a = lora_a.T
yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b) yield (dest_name + ".lora_b", lora_b)

View file

@ -133,7 +133,7 @@ The docker build option is currently limited to *intel GPU* targets.
### Build image ### Build image
```sh ```sh
# Using FP16 # Using FP16
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile . docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
``` ```
*Notes*: *Notes*:

View file

@ -39,11 +39,6 @@ cmake --build build --config Release
``` ```
For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html). For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
```
cmake -B build -DBUILD_SHARED_LIBS=OFF
cmake --build build --config Release
```
- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers: - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
- Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...): - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
@ -55,14 +50,7 @@ cmake --build build --config Release
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release cmake --build build-arm64-windows-llvm-release
``` ```
Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels. Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
For building with the ninja generator and clang as the default compiler, first set the `LIB` environment variable:
`set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64`
```bash
cmake --preset x64-windows-llvm-release
cmake --build build-x64-windows-llvm-release
```
## BLAS Build ## BLAS Build
@ -125,66 +113,19 @@ For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
## CUDA ## CUDA
This provides GPU acceleration using an NVIDIA GPU. Make sure to have the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) installed. This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
#### Download directly from NVIDIA - Using `CMake`:
You may find the official downloads here: [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
```bash
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```
#### Compile and run inside a Fedora Toolbox Container The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
We also have a [guide](./cuda-fedora.md) for setting up the CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
**Recommended for:**
- Users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/), such as [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/), for whom a toolbox is *particularly* convenient.
- Systems where toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) and the [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
- Distributions with optional toolbox packages: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), and [Ubuntu](https://ubuntu.com/download).
### Compilation
```bash
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```
### Override Compute Capability Specifications
If `nvcc` cannot detect your GPU, you may get compile warnings such as:
```text
nvcc warning : Cannot find valid GPU for '-arch=native', default arch is used
```
To override the `native` GPU detection:
#### 1. Take note of the `Compute Capability` of your NVIDIA devices: ["CUDA: Your GPU Compute Capability"](https://developer.nvidia.com/cuda-gpus).
```text
GeForce RTX 4090 8.9
GeForce RTX 3080 Ti 8.6
GeForce RTX 3070 8.6
```
#### 2. Manually list each varying `Compute Capability` in the `CMAKE_CUDA_ARCHITECTURES` list.
```bash
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
```
### Runtime CUDA environment variables
You may set the [CUDA environment variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
```bash
# Use `CUDA_VISIBLE_DEVICES` to hide the first compute device.
CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
```
### Unified Memory
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`. The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
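For example, a minimal sketch of enabling it for a single run on Linux (the model path and flags are placeholders):

```bash
# Let CUDA allocations spill into system RAM instead of failing when VRAM is exhausted.
GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 ./build/bin/llama-cli -m /path/to/model.gguf -ngl 99 -p "Building a website can be done in 10 simple steps:"
```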
### Performance Tuning
The following compilation options are also available to tweak performance: The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description | | Option | Legal values | Default | Description |
@ -331,7 +272,7 @@ You don't need to install Vulkan SDK. It will be installed inside the container.
```sh ```sh
# Build the image # Build the image
docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile . docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
# Then, use it: # Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33

View file

@ -1,270 +0,0 @@
# Setting Up CUDA on Fedora
In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox container. This guide is applicable for:
- [Fedora Workstation](https://fedoraproject.org/workstation/)
- [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/)
- [Fedora Spins](https://fedoraproject.org/spins)
- [Other Distributions](https://containertoolbx.org/distros/), including `Red Hat Enterprise Linux >= 8.5`, `Arch Linux`, and `Ubuntu`.
## Table of Contents
- [Prerequisites](#prerequisites)
- [Using the Fedora 41 CUDA Repository](#using-the-fedora-41-cuda-repository)
- [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
- [Installing Essential Development Tools](#installing-essential-development-tools)
- [Adding the CUDA Repository](#adding-the-cuda-repository)
- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs)
- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts)
- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs)
- [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package)
- [Configuring the Environment](#configuring-the-environment)
- [Verifying the Installation](#verifying-the-installation)
- [Conclusion](#conclusion)
- [Troubleshooting](#troubleshooting)
- [Additional Notes](#additional-notes)
- [References](#references)
## Prerequisites
- **Toolbox Installed on the Host System** `Fedora Silverblue` and `Fedora Workstation` both have toolbox by default; other distributions may need to install the [toolbox package](https://containertoolbx.org/install/).
- **NVIDIA Drivers and Graphics Card installed on Host System (recommended)** To run CUDA programs, such as `llama.cpp`, the host should be set up to access your NVIDIA hardware. Fedora hosts can use the [RPM Fusion Repository](https://rpmfusion.org/Howto/NVIDIA).
- **Internet connectivity** to download packages.
### Using the Fedora 41 CUDA Repository
The latest release is 41.
- [Fedora 41 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/)
**Note:** We recommend using a toolbox environment to prevent system conflicts.
## Creating a Fedora Toolbox Environment
This guide focuses on Fedora hosts, but with small adjustments, it can work for other hosts. Using the Fedora Toolbox allows us to install the necessary packages without affecting the host system.
**Note:** Toolbox is available for other systems, and even without Toolbox, it is possible to use Podman or Docker.
1. **Create a Fedora 41 Toolbox:**
```bash
toolbox create --image registry.fedoraproject.org/fedora-toolbox:41 --container fedora-toolbox-41-cuda
```
2. **Enter the Toolbox:**
```bash
toolbox enter --container fedora-toolbox-41-cuda
```
Inside the toolbox, you have root privileges and can install packages without affecting the host system.
## Installing Essential Development Tools
1. **Synchronize the DNF Package Manager:**
```bash
sudo dnf distro-sync
```
2. **Install the Default Text Editor (Optional):**
```bash
sudo dnf install vim-default-editor --allowerasing
```
The `--allowerasing` flag will allow the removal of the conflicting `nano-default-editor` package.
3. **Install Development Tools and Libraries:**
```bash
sudo dnf install @c-development @development-tools cmake
```
This installs essential packages for compiling software, including `gcc`, `make`, and other development headers.
## Adding the CUDA Repository
Add the NVIDIA CUDA repository to your DNF configuration:
```bash
sudo dnf config-manager addrepo --from-repofile=https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/cuda-fedora41.repo
```
After adding the repository, synchronize the package manager again:
```bash
sudo dnf distro-sync
```
## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
We need to detect whether the host is already supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).
```bash
ls -la /usr/lib64/libcuda.so.1
```
**Explanation:**
- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contain the NVIDIA driver libraries required by CUDA;
  on hosts with the NVIDIA drivers installed, the Fedora container will supply the host libraries (the decision is sketched below).
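A small shell sketch of this decision, mirroring the two paths described below (a hypothetical helper):

```bash
# If the host already passes its driver libraries into the toolbox, only the RPM database
# needs updating; otherwise install the driver libraries inside the toolbox.
if [ -e /usr/lib64/libcuda.so.1 ]; then
    echo "Host-supplied libcuda.so.1 found - follow 'Manually Updating the RPM database' below."
else
    sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
fi
```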
### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).
```bash
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
```
### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).
If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies)
```bash
sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
```
#### 2. Update the RPM database to assume the installation of these packages.
```bash
sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
```
**Note:**
- The `--justdb` option only updates the RPM database, without touching the filesystem.
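To confirm that the RPM database now records the packages (illustrative):
```bash
rpm -q nvidia-driver-libs nvidia-driver-cuda-libs
```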
#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
With the packages now registered in the RPM database, run the install command again:
```bash
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
```
You should receive a message indicating the packages are already installed:
```
Updating and loading repositories:
Repositories loaded.
Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Nothing to do.
```
## Installing the CUDA Meta-Package
Now that the driver libraries are installed (or registered as host-supplied), proceed to install CUDA:
```bash
sudo dnf install cuda
```
This installs the CUDA toolkit and associated packages.
## Configuring the Environment
To use CUDA, add its binary directory to your system's `PATH`.
1. **Create a Profile Script:**
```bash
sudo sh -c 'echo "export PATH=\$PATH:/usr/local/cuda/bin" >> /etc/profile.d/cuda.sh'
```
**Explanation:**
- We add the script to `/etc/profile.d/` because the `/etc/` directory is unique to this particular container and is not shared with other containers or the host system.
- The backslash `\` before `$PATH` stops the outer shell from expanding the variable, so the literal string `$PATH` is written into the script and expanded only when the script is sourced.
2. **Make the Script Executable:**
```bash
sudo chmod +x /etc/profile.d/cuda.sh
```
3. **Source the Script to Update Your Environment:**
```bash
source /etc/profile.d/cuda.sh
```
**Note:** This command updates your current shell session with the new `PATH`. The `/etc/profile.d/cuda.sh` script ensures that the CUDA binaries are available in your `PATH` for all future sessions.
## Verifying the Installation
To confirm that CUDA is correctly installed and configured, check the version of the NVIDIA CUDA Compiler (`nvcc`):
```bash
nvcc --version
```
You should see output similar to:
```
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Wed_Jan_15_19:20:09_PST_2025
Cuda compilation tools, release 12.8, V12.8.61
Build cuda_12.8.r12.8/compiler.35404655_0
```
This output confirms that the CUDA compiler is accessible and indicates the installed version.
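As an optional end-to-end check, you can compile and run a tiny CUDA program. This is only a sketch: the file path is arbitrary, and running the binary requires the host GPU driver to be visible inside the toolbox.
```bash
# Write a one-kernel CUDA program, compile it with nvcc, and run it
cat > /tmp/hello.cu <<'EOF'
#include <cstdio>

__global__ void hello_kernel() {
    printf("Hello from GPU thread %d\n", threadIdx.x);
}

int main() {
    hello_kernel<<<1, 4>>>();                       // launch 4 threads in one block
    return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}
EOF
nvcc /tmp/hello.cu -o /tmp/hello && /tmp/hello
```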
## Conclusion
You have successfully set up CUDA on Fedora within a toolbox environment using the Fedora 41 CUDA repository. By manually updating the RPM database and configuring the environment, you can develop CUDA applications without affecting your host system.
## Troubleshooting
- **Installation Failures:**
- If you encounter errors during installation, carefully read the error messages. They often indicate conflicting files or missing dependencies.
- You may use the `--excludepath` option with `rpm` to exclude conflicting files during manual RPM installations.
- **Rebooting the Container:**
- Sometimes there may be a bug in the NVIDIA driver host passthrough (such as missing a shared library). Rebooting the container may solve this issue:
```bash
# on the host system
podman container restart --all
```
- **Environment Variables Not Set:**
- If `nvcc` is not found after installation, ensure that `/usr/local/cuda/bin` is in your `PATH`.
- Run `echo $PATH` to check if the path is included.
- Re-source the profile script or open a new terminal session.
## Additional Notes
- **Updating CUDA in the Future:**
- Keep an eye on the official NVIDIA repositories for updates to your Fedora version.
- When an updated repository becomes available, adjust your `dnf` configuration accordingly.
- **Building `llama.cpp`:**
- With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support; a minimal example command is sketched after this list.
- Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration.
- **Using the Toolbox Environment:**
- The toolbox environment is isolated from your host system, which helps prevent conflicts.
- Remember that system files and configurations inside the toolbox are separate from the host. By default the home directory of the user is shared between the host and the toolbox.
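Continuing the **Building `llama.cpp`** note above, a minimal CUDA-enabled build inside the toolbox might look like the following sketch; the exact CMake options can change between llama.cpp versions, so prefer the linked build documentation:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j "$(nproc)"
```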
---
**Disclaimer:** Manually installing and modifying system packages can destabilize the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with the toolbox.
**Acknowledgments:** Special thanks to the Fedora community and NVIDIA documentation for providing resources that assisted in creating this guide.
## References
- [Fedora Toolbox Documentation](https://docs.fedoraproject.org/en-US/fedora-silverblue/toolbox/)
- [NVIDIA CUDA Installation Guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
- [Podman Documentation](https://podman.io/get-started)
---

View file

@ -28,7 +28,7 @@ The required steps to implement for an HF model are:
```python ```python
@Model.register("MyModelForCausalLM") @Model.register("MyModelForCausalLM")
class MyModel(Model): class MyModel(Model):
model_arch = gguf.MODEL_ARCH.MYMODEL model_arch = gguf.MODEL_ARCH.GROK
``` ```
2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py) 2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
@ -79,14 +79,14 @@ Depending on the model configuration, tokenizer, code and tensors layout, you wi
- `Model#set_vocab` - `Model#set_vocab`
- `Model#write_tensors` - `Model#write_tensors`
NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the convention and several tools like `quantize` expect this to proceed the weights. NOTE: Tensor names must end with `.weight` suffix, that is the convention and several tools like `quantize` expect this to proceed the weights.
### 2. Define the model architecture in `llama.cpp` ### 2. Define the model architecture in `llama.cpp`
The model params and tensors layout must be defined in `llama.cpp`: The model params and tensors layout must be defined in `llama.cpp`:
1. Define a new `llm_arch` 1. Define a new `llm_arch`
2. Define the tensors layout in `LLM_TENSOR_NAMES` 2. Define the tensors layout in `LLM_TENSOR_NAMES`
3. Add any non-standard metadata in `llm_load_hparams` 3. Add any non standard metadata in `llm_load_hparams`
4. Create the tensors for inference in `llm_load_tensors` 4. Create the tensors for inference in `llm_load_tensors`
5. If the model has a RoPE operation, add the rope type in `llama_rope_type` 5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
@ -96,9 +96,9 @@ NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorc
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`. This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`. Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR. When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/). Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).

View file

@ -60,9 +60,9 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
## Building Docker locally ## Building Docker locally
```bash ```bash
docker build -t local/llama.cpp:full-cuda --target full -f .devops/cuda.Dockerfile . docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda --target light -f .devops/cuda.Dockerfile . docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda --target server -f .devops/cuda.Dockerfile . docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
``` ```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
@ -95,9 +95,9 @@ Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/
## Building Docker locally ## Building Docker locally
```bash ```bash
docker build -t local/llama.cpp:full-musa --target full -f .devops/musa.Dockerfile . docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
docker build -t local/llama.cpp:light-musa --target light -f .devops/musa.Dockerfile . docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
docker build -t local/llama.cpp:server-musa --target server -f .devops/musa.Dockerfile . docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
``` ```
You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture. You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.

View file

@ -1,51 +0,0 @@
# LLGuidance Support in llama.cpp
[LLGuidance](https://github.com/guidance-ai/llguidance) is a library for constrained decoding (also called constrained sampling or structured outputs) for Large Language Models (LLMs). Initially developed as the backend for the [Guidance](https://github.com/guidance-ai/guidance) library, it can also be used independently.
LLGuidance supports JSON Schemas and arbitrary context-free grammars (CFGs) written in a [variant](https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md) of Lark syntax. It is [very fast](https://github.com/guidance-ai/jsonschemabench/tree/main/maskbench) and has [excellent](https://github.com/guidance-ai/llguidance/blob/main/docs/json_schema.md) JSON Schema coverage but requires the Rust compiler, which complicates the llama.cpp build process.
## Building
To enable LLGuidance support, build llama.cpp with the `LLAMA_LLGUIDANCE` option:
```sh
cmake -B build -DLLAMA_LLGUIDANCE=ON
make -C build -j
```
This requires the Rust compiler and the `cargo` tool to be [installed](https://www.rust-lang.org/tools/install).
## Interface
There are no new command-line arguments or modifications to `common_params`. When enabled, grammars starting with `%llguidance` are passed to LLGuidance instead of the [current](../grammars/README.md) llama.cpp grammars. Additionally, JSON Schema requests (e.g., using the `-j` argument in `llama-cli`) are also passed to LLGuidance.
For your existing GBNF grammars, you can use [gbnf_to_lark.py script](https://github.com/guidance-ai/llguidance/blob/main/scripts/gbnf_to_lark.py) to convert them to LLGuidance Lark-like format.
## Performance
Computing a "token mask" (i.e., the set of allowed tokens) for a llama3 tokenizer with 128k tokens takes, on average, 50μs of single-core CPU time for the [JSON Schema Bench](https://github.com/guidance-ai/jsonschemabench). The p99 time is 0.5ms, and the p100 time is 20ms. These results are due to the lexer/parser split and several [optimizations](https://github.com/guidance-ai/llguidance/blob/main/docs/optimizations.md).
## JSON Schema
LLGuidance adheres closely to the JSON Schema specification. For example:
- `additionalProperties` defaults to `true`, unlike current grammars, though you can set `"additionalProperties": false` if needed.
- any whitespace is allowed.
- The definition order in the `"properties": {}` object is maintained, regardless of whether properties are required (current grammars always puts required properties first).
Unsupported schemas result in an error message—no keywords are silently ignored.
## Why Not Reuse GBNF Format?
GBNF lacks the concept of a lexer.
Most programming languages, including JSON, use a two-step process: a lexer (built with regular expressions) converts a byte stream into lexemes, which are then processed by a CFG parser. This approach is faster because lexers are cheaper to evaluate, and there is ~10x fewer lexemes than bytes.
LLM tokens often align with lexemes, so the parser is engaged in under 0.5% of tokens, with the lexer handling the rest.
However, the user has to provide the distinction between lexemes and CFG symbols. In [Lark](https://github.com/lark-parser/lark), lexeme names are uppercase, while CFG symbols are lowercase.
The [gbnf_to_lark.py script](https://github.com/guidance-ai/llguidance/blob/main/scripts/gbnf_to_lark.py) can often take care of this automatically.
See [LLGuidance syntax docs](https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#terminals-vs-rules) for more details.
## Error Handling
Errors are currently printed to `stderr`, and generation continues. Improved error handling may be added in the future.

View file

@ -20,12 +20,7 @@ else()
add_subdirectory(batched) add_subdirectory(batched)
add_subdirectory(embedding) add_subdirectory(embedding)
add_subdirectory(eval-callback) add_subdirectory(eval-callback)
add_subdirectory(gbnf-validator)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(gbnf-validator)
endif()
add_subdirectory(gguf-hash) add_subdirectory(gguf-hash)
add_subdirectory(gguf-split) add_subdirectory(gguf-split)
add_subdirectory(gguf) add_subdirectory(gguf)
@ -51,17 +46,12 @@ else()
add_subdirectory(speculative) add_subdirectory(speculative)
add_subdirectory(speculative-simple) add_subdirectory(speculative-simple)
add_subdirectory(tokenize) add_subdirectory(tokenize)
add_subdirectory(tts)
add_subdirectory(gen-docs)
if (NOT GGML_BACKEND_DL) if (NOT GGML_BACKEND_DL)
# these examples use the backends directly and cannot be built with dynamic loading # these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(cvector-generator) add_subdirectory(cvector-generator)
add_subdirectory(export-lora) add_subdirectory(export-lora)
if (NOT WIN32) add_subdirectory(quantize-stats)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(quantize-stats)
endif()
add_subdirectory(llava) add_subdirectory(llava)
if (GGML_RPC) if (GGML_RPC)
add_subdirectory(rpc) add_subdirectory(rpc)

View file

@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = common_model_params_to_llama(params); llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__); fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@ -50,7 +50,7 @@ int main(int argc, char ** argv) {
// ensure enough sequences are available // ensure enough sequences are available
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end()); ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
llama_context * ctx = llama_init_from_model(model, ctx_params); llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) { if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
llama_batch_free(batch); llama_batch_free(batch);
llama_free(ctx); llama_free(ctx);
llama_model_free(model); llama_free_model(model);
llama_backend_free(); llama_backend_free();

View file

@ -23,17 +23,12 @@ defer {
} }
let model_params = llama_model_default_params() let model_params = llama_model_default_params()
guard let model = llama_model_load_from_file(modelPath.cString(using: .utf8), model_params) else { guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
print("Failed to load model") print("Failed to load model")
exit(1) exit(1)
} }
defer { defer {
llama_model_free(model) llama_free_model(model)
}
guard let vocab = llama_model_get_vocab(model) else {
print("Failed to get vocab")
exit(1)
} }
var tokens = tokenize(text: prompt, add_bos: true) var tokens = tokenize(text: prompt, add_bos: true)
@ -46,7 +41,7 @@ context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8 context_params.n_threads = 8
context_params.n_threads_batch = 8 context_params.n_threads_batch = 8
let context = llama_init_from_model(model, context_params) let context = llama_new_context_with_model(model, context_params)
guard context != nil else { guard context != nil else {
print("Failed to initialize context") print("Failed to initialize context")
exit(1) exit(1)
@ -146,7 +141,7 @@ while n_cur <= n_len {
let new_token_id = llama_sampler_sample(smpl, context, i_batch[i]) let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
// is it an end of stream? -> mark the stream as finished // is it an end of stream? -> mark the stream as finished
if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len { if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
i_batch[i] = -1 i_batch[i] = -1
// print("") // print("")
if n_parallel > 1 { if n_parallel > 1 {
@ -212,7 +207,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0) let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens) let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
var swiftTokens: [llama_token] = [] var swiftTokens: [llama_token] = []
for i in 0 ..< tokenCount { for i in 0 ..< tokenCount {
swiftTokens.append(tokens[Int(i)]) swiftTokens.append(tokens[Int(i)])
@ -223,12 +218,12 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? { private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
var result = [CChar](repeating: 0, count: 8) var result = [CChar](repeating: 0, count: 8)
let nTokens = llama_token_to_piece(vocab, token, &result, Int32(result.count), 0, false) let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
if nTokens < 0 { if nTokens < 0 {
let actualTokensCount = -Int(nTokens) let actualTokensCount = -Int(nTokens)
result = .init(repeating: 0, count: actualTokensCount) result = .init(repeating: 0, count: actualTokensCount)
let check = llama_token_to_piece( let check = llama_token_to_piece(
vocab, model,
token, token,
&result, &result,
Int32(result.count), Int32(result.count),

View file

@ -41,19 +41,17 @@ int main(int argc, char ** argv) {
llama_model_params model_params = common_model_params_to_llama(params); llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
LOG_ERR("%s: error: unable to load model\n" , __func__); LOG_ERR("%s: error: unable to load model\n" , __func__);
return 1; return 1;
} }
const llama_vocab * vocab = llama_model_get_vocab(model);
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> tokens_list; std::vector<llama_token> tokens_list;
tokens_list = common_tokenize(vocab, params.prompt, true); tokens_list = common_tokenize(model, params.prompt, true);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel; const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
@ -64,10 +62,9 @@ int main(int argc, char ** argv) {
ctx_params.n_ctx = n_kv_req; ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_predict, n_parallel); ctx_params.n_batch = std::max(n_predict, n_parallel);
llama_context * ctx = llama_init_from_model(model, ctx_params); llama_context * ctx = llama_new_context_with_model(model, ctx_params);
auto sparams = llama_sampler_chain_default_params(); auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false;
llama_sampler * smpl = llama_sampler_chain_init(sparams); llama_sampler * smpl = llama_sampler_chain_init(sparams);
@ -122,8 +119,8 @@ int main(int argc, char ** argv) {
} }
llama_token decoder_start_token_id = llama_model_decoder_start_token(model); llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == LLAMA_TOKEN_NULL) { if (decoder_start_token_id == -1) {
decoder_start_token_id = llama_vocab_bos(vocab); decoder_start_token_id = llama_token_bos(model);
} }
common_batch_clear(batch); common_batch_clear(batch);
@ -176,7 +173,7 @@ int main(int argc, char ** argv) {
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]); const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
// is it an end of generation? -> mark the stream as finished // is it an end of generation? -> mark the stream as finished
if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) { if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1; i_batch[i] = -1;
LOG("\n"); LOG("\n");
if (n_parallel > 1) { if (n_parallel > 1) {
@ -238,7 +235,7 @@ int main(int argc, char ** argv) {
llama_sampler_free(smpl); llama_sampler_free(smpl);
llama_free(ctx); llama_free(ctx);
llama_model_free(model); llama_free_model(model);
llama_backend_free(); llama_backend_free();

View file

@ -1,6 +1,4 @@
#include "ggml.h" #include "ggml.h"
#include "gguf.h"
#include "llama.h" #include "llama.h"
#include "common.h" #include "common.h"
#include "log.h" #include "log.h"
@ -436,12 +434,12 @@ static void print_matrix(struct ggml_tensor * probs) {
} }
} }
struct my_llama_file { struct llama_file {
// use FILE * so we don't have to re-open the file to mmap // use FILE * so we don't have to re-open the file to mmap
FILE * fp; FILE * fp;
size_t size; size_t size;
my_llama_file(const char * fname, const char * mode) { llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode); fp = std::fopen(fname, mode);
if (fp == NULL) { if (fp == NULL) {
size = 0; size = 0;
@ -502,7 +500,7 @@ struct my_llama_file {
return std::string(chars.data(), len); return std::string(chars.data(), len);
} }
~my_llama_file() { ~llama_file() {
if (fp) { if (fp) {
std::fclose(fp); std::fclose(fp);
} }
@ -510,7 +508,7 @@ struct my_llama_file {
}; };
static bool is_ggml_file(const char * filename) { static bool is_ggml_file(const char * filename) {
my_llama_file file(filename, "rb"); llama_file file(filename, "rb");
if (file.size < 4) { if (file.size < 4) {
return false; return false;
} }
@ -578,7 +576,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
} else { } else {
// assume llama2.c vocabulary // assume llama2.c vocabulary
LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
my_llama_file file(filename, "rb"); llama_file file(filename, "rb");
if (!file.fp) { if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename); die_fmt("%s: %s", strerror(errno), filename);
} }
@ -691,8 +689,8 @@ static void save_as_llama_model(
gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL); gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL); gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx); gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
@ -911,7 +909,7 @@ int main(int argc, char ** argv) {
load_vocab(params.fn_vocab_model, &config, &vocab); load_vocab(params.fn_vocab_model, &config, &vocab);
struct my_llama_model model; struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_vocab_n_vocab(lctx); model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx; model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd; model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim; model.hparams.n_ff = config.hidden_dim;

View file

@ -1,9 +1,7 @@
#include "ggml.h"
#include "gguf.h"
#include "arg.h" #include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "ggml.h"
#include "pca.hpp" #include "pca.hpp"
#include "mean.hpp" #include "mean.hpp"
@ -273,9 +271,7 @@ struct tokenized_prompt {
size_t max_seq_len; size_t max_seq_len;
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const llama_model * model = llama_get_model(ctx); const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
const llama_vocab * vocab = llama_model_get_vocab(model);
const bool add_bos = llama_vocab_get_add_bos(vocab);
tokens_pos = common_tokenize(ctx, pos, add_bos, true); tokens_pos = common_tokenize(ctx, pos, add_bos, true);
tokens_neg = common_tokenize(ctx, neg, add_bos, true); tokens_neg = common_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@ -419,13 +415,12 @@ int main(int argc, char ** argv) {
// load the model to get hparams // load the model to get hparams
common_init_result llama_init = common_init_from_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model.get(); llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context.get(); llama_context * ctx = llama_init.context;
// int n_ctx = llama_n_ctx(ctx); // int n_ctx = llama_n_ctx(ctx);
int n_layers = llama_model_n_layer(model); int n_layers = llama_n_layer(model);
int n_embd = llama_model_n_embd(model); int n_embd = llama_n_embd(model);
// get model hint param (a.k.a model arch name) // get model hint param (a.k.a model arch name)
char model_hint[128]; char model_hint[128];
llama_model_meta_val_str(model, "general.architecture", model_hint, 128); llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
@ -479,6 +474,8 @@ int main(int argc, char ** argv) {
// done with the model, we can now free it to make gain some memory // done with the model, we can now free it to make gain some memory
printf("Done evaluate prompts, unload model...\n"); printf("Done evaluate prompts, unload model...\n");
llama_free(ctx);
llama_free_model(model);
bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA; bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;

View file

@ -15,7 +15,7 @@ static void run(
for (size_t il = 0; il < v_input.size(); ++il) { for (size_t il = 0; il < v_input.size(); ++il) {
// prepare output vector // prepare output vector
struct ggml_tensor * ctrl_out = v_output[il]; struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%zu", il+1); ggml_format_name(ctrl_out, "direction.%ld", il+1);
// calculate mean vector // calculate mean vector
struct ggml_tensor * t_layer = v_input[il]; struct ggml_tensor * t_layer = v_input[il];

View file

@ -302,7 +302,7 @@ static void run_pca(
// prepare output vector // prepare output vector
struct ggml_tensor * ctrl_out = v_output[il]; struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%zu", il+1); ggml_format_name(ctrl_out, "direction.%ld", il+1);
// run power_iteration // run power_iteration
params.i_layer = il; params.i_layer = il;

View file

@ -12,7 +12,7 @@ int main(int argc, char** argv) {
} }
// Get only the program name from the full path // Get only the program name from the full path
auto pos = filename.find_last_of("/\\"); auto pos = filename.find_last_of('/');
if (pos != std::string::npos) { if (pos != std::string::npos) {
filename = filename.substr(pos+1); filename = filename.substr(pos+1);
} }

View file

@ -97,17 +97,14 @@ int main(int argc, char ** argv) {
// load the model // load the model
common_init_result llama_init = common_init_from_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model.get(); llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context.get(); llama_context * ctx = llama_init.context;
if (model == NULL) { if (model == NULL) {
LOG_ERR("%s: unable to load model\n", __func__); LOG_ERR("%s: unable to load model\n", __func__);
return 1; return 1;
} }
const llama_vocab * vocab = llama_model_get_vocab(model); const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
@ -150,7 +147,7 @@ int main(int argc, char ** argv) {
// check if the last token is SEP // check if the last token is SEP
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) { for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) { if (inp.empty() || inp.back() != llama_token_sep(model)) {
LOG_WRN("%s: last token in the prompt is not SEP\n", __func__); LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
} }
@ -183,7 +180,7 @@ int main(int argc, char ** argv) {
} }
// allocate output // allocate output
const int n_embd = llama_model_n_embd(model); const int n_embd = llama_n_embd(model);
std::vector<float> embeddings(n_embd_count * n_embd, 0); std::vector<float> embeddings(n_embd_count * n_embd, 0);
float * emb = embeddings.data(); float * emb = embeddings.data();
@ -319,6 +316,8 @@ int main(int argc, char ** argv) {
// clean up // clean up
llama_batch_free(batch); llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
llama_backend_free(); llama_backend_free();
return 0; return 0;

View file

@ -127,10 +127,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
} }
static bool run(llama_context * ctx, const common_params & params) { static bool run(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx); const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
const llama_vocab * vocab = llama_model_get_vocab(model);
const bool add_bos = llama_vocab_get_add_bos(vocab);
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos); std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
@ -165,9 +162,8 @@ int main(int argc, char ** argv) {
// init // init
common_init_result llama_init = common_init_from_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model.get(); llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context.get(); llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) { if (model == nullptr || ctx == nullptr) {
LOG_ERR("%s : failed to init\n", __func__); LOG_ERR("%s : failed to init\n", __func__);
return 1; return 1;
@ -188,6 +184,9 @@ int main(int argc, char ** argv) {
LOG("\n"); LOG("\n");
llama_perf_context_print(ctx); llama_perf_context_print(ctx);
llama_free(ctx);
llama_free_model(model);
llama_backend_free(); llama_backend_free();
return 0; return 0;

View file

@ -1,13 +1,12 @@
#include "ggml.h"
#include "ggml-alloc.h"
#include "gguf.h"
#include "arg.h" #include "arg.h"
#include "common.h" #include "common.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include <map> #include <map>
#include <vector> #include <vector>
#include <string> #include <string>
#include <thread>
#include <fstream> #include <fstream>
static bool g_verbose = false; static bool g_verbose = false;
@ -129,7 +128,7 @@ struct lora_merge_ctx {
lora_merge_ctx( lora_merge_ctx(
std::string & base_fname, std::string & base_fname,
std::vector<common_adapter_lora_info> & lora_files, std::vector<common_lora_adapter_info> & lora_files,
std::string & outfile, std::string & outfile,
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
fout.exceptions(std::ofstream::failbit); // fail fast on write errors fout.exceptions(std::ofstream::failbit); // fail fast on write errors
@ -266,8 +265,8 @@ struct lora_merge_ctx {
fout.write((const char *)data.data(), data.size()); fout.write((const char *)data.data(), data.size());
} }
printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged); printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
printf("%s : wrote %zu tensors to output file\n", __func__, trans.size()); printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
} }
void copy_tensor(struct ggml_tensor * base) { void copy_tensor(struct ggml_tensor * base) {
@ -345,25 +344,15 @@ struct lora_merge_ctx {
gf = ggml_new_graph(ctx0); gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur = inp_base; struct ggml_tensor * cur = inp_base;
for (size_t i = 0; i < adapters.size(); ++i) { for (size_t i = 0; i < adapters.size(); ++i) {
struct ggml_tensor * delta; struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
bool is_tok_embd = string_starts_with(name_base, "token_embd"); struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
if (is_tok_embd) {
printf("%s : detected token embeddings tensor\n", __func__);
delta = ggml_mul_mat(ctx0,
ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32),
ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32));
} else {
delta = ggml_mul_mat(ctx0,
ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))),
ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
}
// scale // scale
const float alpha = adapters[i]->alpha; const float alpha = adapters[i]->alpha;
const float rank = (float) inp_b[i]->ne[0]; const float rank = (float) inp_b[i]->ne[0];
const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale; const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
delta = ggml_scale(ctx0, delta, scale); delta = ggml_scale(ctx0, delta, scale);
cur = ggml_add(ctx0, delta, cur); cur = ggml_add(ctx0, delta, cur);
printf("%s : + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type)); printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]); printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
} }
cur = ggml_cast(ctx0, cur, out->type); cur = ggml_cast(ctx0, cur, out->type);

View file

@ -11,15 +11,19 @@
static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
const auto cpts = unicode_cpts_from_utf8(input_str); const auto cpts = unicode_cpts_from_utf8(input_str);
auto & stacks_cur = llama_grammar_get_stacks(grammar); const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
size_t pos = 0; size_t pos = 0;
for (const auto & cpt : cpts) { for (const auto & cpt : cpts) {
llama_grammar_accept(grammar, cpt); const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
if (stacks_cur.empty()) { if (stacks_cur.empty()) {
error_pos = pos; error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'"; error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
stacks_cur = stacks_prev;
return false; return false;
} }
++pos; ++pos;
@ -76,10 +80,9 @@ int main(int argc, char** argv) {
grammar_str = buffer.str(); grammar_str = buffer.str();
} }
llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0); llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
if (grammar == nullptr) { if (grammar == nullptr) {
fprintf(stdout, "Failed to initialize llama_grammar\n"); throw std::runtime_error("Failed to initialize llama_grammar");
return 1;
} }
// Read the input file // Read the input file
std::string input_str; std::string input_str;

View file

@ -1,5 +1,4 @@
#include "ggml.h" #include "ggml.h"
#include "gguf.h"
#include <cstdlib> /* abort() */ #include <cstdlib> /* abort() */
#include <cstddef> #include <cstddef>

View file

@ -1,19 +1,18 @@
#include "ggml.h"
#include "gguf.h"
#include "llama.h" #include "llama.h"
#include "common.h" #include "common.h"
#include <algorithm> #include <algorithm>
#include <cinttypes> #include <cmath>
#include <climits>
#include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <stdexcept>
#include <cstring>
#include <fstream> #include <fstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <stdio.h>
#include <string.h>
#include <climits>
#include <stdexcept>
#if defined(_WIN32) #if defined(_WIN32)
#include <windows.h> #include <windows.h>
#ifndef PATH_MAX #ifndef PATH_MAX
@ -288,7 +287,7 @@ struct split_strategy {
} }
void print_info() { void print_info() {
printf("n_split: %zu\n", ctx_outs.size()); printf("n_split: %ld\n", ctx_outs.size());
int i_split = 0; int i_split = 0;
for (auto & ctx_out : ctx_outs) { for (auto & ctx_out : ctx_outs) {
// re-calculate the real gguf size for each split (= metadata size + total size of all tensors) // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
@ -298,7 +297,7 @@ struct split_strategy {
total_size += ggml_nbytes(t); total_size += ggml_nbytes(t);
} }
total_size = total_size / 1000 / 1000; // convert to megabytes total_size = total_size / 1000 / 1000; // convert to megabytes
printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
i_split++; i_split++;
} }
} }

View file

@ -41,7 +41,7 @@ echo PASS
echo echo
# 2b. Test the sharded model is loading properly # 2b. Test the sharded model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
echo PASS echo PASS
echo echo
@ -51,7 +51,7 @@ echo PASS
echo echo
# 3b. Test the merged model is loading properly # 3b. Test the merged model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
echo PASS echo PASS
echo echo
@ -61,7 +61,7 @@ echo PASS
echo echo
# 4b. Test the sharded model is loading properly # 4b. Test the sharded model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
echo PASS echo PASS
echo echo
@ -71,7 +71,7 @@ echo
#echo #echo
# 5b. Test the merged model is loading properly # 5b. Test the merged model is loading properly
#$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32 #$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#echo PASS #echo PASS
#echo #echo
@ -81,7 +81,7 @@ echo PASS
echo echo
# 6b. Test the sharded model is loading properly # 6b. Test the sharded model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
echo PASS echo PASS
echo echo

View file

@ -1,5 +1,4 @@
#include "ggml.h" #include "ggml.h"
#include "gguf.h"
#include <cstdio> #include <cstdio>
#include <string> #include <string>
@ -135,6 +134,7 @@ static bool gguf_ex_read_0(const std::string & fname) {
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i); const char * name = gguf_get_tensor_name (ctx, i);
const size_t size = gguf_get_tensor_size (ctx, i); const size_t size = gguf_get_tensor_size (ctx, i);
// const size_t size = 0;
const size_t offset = gguf_get_tensor_offset(ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i);
printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset); printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
@ -183,6 +183,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i); const char * name = gguf_get_tensor_name (ctx, i);
const size_t size = gguf_get_tensor_size (ctx, i); const size_t size = gguf_get_tensor_size (ctx, i);
// const size_t size = 0;
const size_t offset = gguf_get_tensor_offset(ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i);
printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset); printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
@ -200,7 +201,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n", printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d) name = %s, data = %p\n",
__func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data); __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
// print first 10 elements // print first 10 elements

View file

@ -11,7 +11,6 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
std::vector<std::vector<float>> result; std::vector<std::vector<float>> result;
const llama_model * model = llama_get_model(ctx); const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
@ -20,16 +19,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
const std::string input_string = instruction + sentences[i]; const std::string input_string = instruction + sentences[i];
std::vector<llama_token> inputs = common_tokenize(vocab, input_string, true, false); std::vector<llama_token> inputs = common_tokenize(model, input_string, true, false);
const int32_t n_toks = inputs.size(); const int32_t n_toks = inputs.size();
// GritLM seems to have EOS = "" // GritLM seems to have EOS = ""
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18 // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
// inputs.push_back(llama_vocab_eos(vocab)); // inputs.push_back(llama_token_eos(model));
// we want to ignore instruction tokens for mean pooling // we want to ignore instruction tokens for mean pooling
const int32_t n_inst = common_tokenize(vocab, instruction, true, false).size(); const int32_t n_inst = common_tokenize(model, instruction, true, false).size();
#ifdef GRIT_DEBUG #ifdef GRIT_DEBUG
// debug tokens - should be matching as referenced in the GritLM sample // debug tokens - should be matching as referenced in the GritLM sample
@ -53,7 +52,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
llama_decode(ctx, batch); llama_decode(ctx, batch);
// get embedding dimensions // get embedding dimensions
uint64_t n_embd = llama_model_n_embd(model); uint64_t n_embd = llama_n_embd(model);
// allocate embedding output // allocate embedding output
std::vector<float> emb_unorm(n_embd, 0.0f); std::vector<float> emb_unorm(n_embd, 0.0f);
@ -76,7 +75,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
} }
std::vector<float> emb_norm(emb_unorm.size()); std::vector<float> emb_norm(emb_unorm.size());
common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2); common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
result.push_back(emb_norm); result.push_back(emb_norm);
#ifdef GRIT_DEBUG #ifdef GRIT_DEBUG
@ -98,9 +97,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
std::string result; std::string result;
const llama_model * model = llama_get_model(ctx); const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model); llama_token eos_token = llama_token_eos(model);
llama_token eos_token = llama_vocab_eos(vocab);
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, false); llama_set_embeddings(ctx, false);
@ -108,7 +105,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
std::vector<llama_token> inputs = common_tokenize(vocab, prompt, false, true); std::vector<llama_token> inputs = common_tokenize(model, prompt, false, true);
int32_t i_current_token = 0; int32_t i_current_token = 0;
while (true) { while (true) {
@ -168,10 +165,10 @@ int main(int argc, char * argv[]) {
llama_backend_init(); llama_backend_init();
llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams); llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
// create generation context // create generation context
llama_context * ctx = llama_init_from_model(model, cparams); llama_context * ctx = llama_new_context_with_model(model, cparams);
auto sparams = llama_sampler_chain_default_params(); auto sparams = llama_sampler_chain_default_params();
@ -200,7 +197,7 @@ int main(int argc, char * argv[]) {
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction("")); const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction)); const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
const int n_embd = llama_model_n_embd(model); const int n_embd = llama_n_embd(model);
const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
@ -222,7 +219,7 @@ int main(int argc, char * argv[]) {
llama_sampler_free(smpl); llama_sampler_free(smpl);
llama_free(ctx); llama_free(ctx);
llama_model_free(model); llama_free_model(model);
llama_backend_free(); llama_backend_free();
return 0; return 0;

View file

@ -7,6 +7,7 @@
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <ctime> #include <ctime>
#include <sstream>
#include <thread> #include <thread>
#include <mutex> #include <mutex>
#include <vector> #include <vector>
@ -39,7 +40,7 @@ public:
void set_params(common_params params) { m_params = std::move(params); } void set_params(common_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix(int ncall = -1) const; void save_imatrix(int ncall = -1) const;
bool load_imatrix(const char * fname); bool load_imatrix(const char * file_name);
private: private:
std::unordered_map<std::string, Stats> m_stats; std::unordered_map<std::string, Stats> m_stats;
common_params m_params; common_params m_params;
@ -428,14 +429,10 @@ static void process_logits(
} }
static bool compute_imatrix(llama_context * ctx, const common_params & params) { static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx); const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
const llama_vocab * vocab = llama_model_get_vocab(model); GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
const bool add_bos = llama_vocab_get_add_bos(vocab);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
auto tim1 = std::chrono::high_resolution_clock::now(); auto tim1 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenizing the input ..\n", __func__); LOG_INF("%s: tokenizing the input ..\n", __func__);
@ -470,7 +467,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_vocab_n_tokens(vocab); const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
int count = 0; int count = 0;
@ -510,7 +507,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
// add BOS token for the first batch of each chunk // add BOS token for the first batch of each chunk
if (add_bos && j == 0) { if (add_bos && j == 0) {
tokens[batch_start] = llama_vocab_bos(vocab); tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
} }
common_batch_clear(batch); common_batch_clear(batch);
@ -621,15 +618,14 @@ int main(int argc, char ** argv) {
// init // init
common_init_result llama_init = common_init_from_params(params); common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model.get(); llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context.get(); llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) { if (model == nullptr || ctx == nullptr) {
LOG_ERR("%s : failed to init\n", __func__); LOG_ERR("%s : failed to init\n", __func__);
return 1; return 1;
} }
const int n_ctx_train = llama_model_n_ctx_train(model); const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) { if (params.n_ctx > n_ctx_train) {
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx); __func__, n_ctx_train, params.n_ctx);
@ -659,6 +655,9 @@ int main(int argc, char ** argv) {
LOG("\n"); LOG("\n");
llama_perf_context_print(ctx); llama_perf_context_print(ctx);
llama_free(ctx);
llama_free_model(model);
llama_backend_free(); llama_backend_free();
return 0; return 0;

View file

@@ -131,17 +131,15 @@ int main(int argc, char ** argv) {
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
     common_init_result llama_init = common_init_from_params(params);
-    model = llama_init.model.get();
-    ctx = llama_init.context.get();
+    model = llama_init.model;
+    ctx = llama_init.context;
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-    const int n_ctx_train = llama_model_n_ctx_train(model);
+    const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
     LOG_DBG("n_ctx: %d\n", n_ctx);
@@ -154,28 +152,28 @@ int main(int argc, char ** argv) {
         LOG_INF("\n");
         LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
-    const bool add_bos = llama_vocab_get_add_bos(vocab);
-    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
+    const bool add_bos = llama_add_bos_token(model);
+    GGML_ASSERT(!llama_add_eos_token(model));
     std::vector<llama_token> embd_inp;
     std::vector<llama_token> embd_end;
     std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
     std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
-    GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
-    GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
+    GGML_ASSERT(llama_token_fim_pre(model) >= 0);
+    GGML_ASSERT(llama_token_fim_suf(model) >= 0);
-    inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
-    inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
+    inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+    inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
     embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
     embd_end = params.spm_infill ? inp_pfx : inp_sfx;
     if (add_bos) {
-        embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
+        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
     }
     embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
-    const llama_token middle_token = llama_vocab_fim_mid(vocab);
+    const llama_token middle_token = llama_token_fim_mid(model);
     if (middle_token >= 0) {
         embd_inp.push_back(middle_token);
     }
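The hunk above assembles the fill-in-the-middle prompt: prefix and suffix are tokenized, tagged with the FIM pre/suf tokens, optionally swapped for SPM-style infill, prefixed with BOS when the vocab asks for it, and closed with the FIM mid token. A compact sketch of the same construction against the vocab-based helpers from the - side (a sketch, not the exact example code):

#include <vector>
#include "llama.h"

// Sketch: build a FIM token sequence <PRE>prefix<SUF>suffix<MID> (or the
// SPM variant with prefix and suffix swapped). `tok_prefix`/`tok_suffix`
// are assumed to be already-tokenized user input.
static std::vector<llama_token> build_fim_prompt(
        const llama_vocab * vocab,
        std::vector<llama_token> tok_prefix,
        std::vector<llama_token> tok_suffix,
        bool spm_infill) {
    tok_prefix.insert(tok_prefix.begin(), llama_vocab_fim_pre(vocab));
    tok_suffix.insert(tok_suffix.begin(), llama_vocab_fim_suf(vocab));

    std::vector<llama_token> inp = spm_infill ? tok_suffix : tok_prefix;
    std::vector<llama_token> end = spm_infill ? tok_prefix : tok_suffix;

    if (llama_vocab_get_add_bos(vocab)) {
        inp.insert(inp.begin(), llama_vocab_bos(vocab));
    }
    inp.insert(inp.end(), end.begin(), end.end());

    const llama_token middle = llama_vocab_fim_mid(vocab);
    if (middle >= 0) {
        inp.push_back(middle); // generation continues after the <MID> tag
    }
    return inp;
}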
@@ -187,7 +185,7 @@ int main(int argc, char ** argv) {
     // Should not run without any tokens
     if (embd_inp.empty()) {
-        embd_inp.push_back(llama_vocab_bos(vocab));
+        embd_inp.push_back(llama_token_bos(model));
         LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
     }
@@ -422,10 +420,10 @@ int main(int argc, char ** argv) {
         // if not currently processing queued inputs;
         if ((int) embd_inp.size() <= n_consumed) {
             // deal with eot token in infill mode
-            if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
+            if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
                 if (is_interacting && !params.interactive_first) {
                     // print an eot token
-                    LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
+                    LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
                 }
                 LOG("\n");
                 console::set_display(console::user_input);
@@ -465,13 +463,13 @@ int main(int argc, char ** argv) {
                 std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
                 std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
-                inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
-                inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
+                inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+                inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
                 embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
                 embd_end = params.spm_infill ? inp_pfx : inp_sfx;
                 if (add_bos) {
-                    embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
+                    embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
                 }
                 embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
@@ -486,7 +484,7 @@ int main(int argc, char ** argv) {
                 is_interacting = false;
             }
             // deal with end of generation tokens in interactive mode
-            else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
+            else if (llama_token_is_eog(model, common_sampler_last(smpl))) {
                 LOG_DBG("found EOS token\n");
                 if (params.interactive) {
@@ -502,7 +500,7 @@ int main(int argc, char ** argv) {
             if (params.input_prefix_bos) {
                 LOG_DBG("adding input prefix BOS token\n");
-                embd_inp.push_back(llama_vocab_bos(vocab));
+                embd_inp.push_back(llama_token_bos(model));
             }
             std::string buffer;
@@ -565,7 +563,7 @@ int main(int argc, char ** argv) {
         }
         // end of generation
-        if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
+        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
             break;
         }
@@ -577,12 +575,15 @@ int main(int argc, char ** argv) {
        }
    }
    if (!params.interactive && n_remain <= 0) {
-        LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
+        LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
    }
    LOG("\n");
    common_perf_print(ctx, smpl);
+    llama_free(ctx);
+    llama_free_model(model);
    common_sampler_free(smpl);
    llama_backend_free();

View file

@@ -683,7 +683,7 @@ struct cmd_params_instance {
     bool cpu_strict;
     int poll;
     int n_gpu_layers;
-    std::string rpc_servers_str;
+    std::string rpc_servers;
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
@@ -696,37 +696,8 @@ struct cmd_params_instance {
         llama_model_params mparams = llama_model_default_params();
         mparams.n_gpu_layers = n_gpu_layers;
-        if (!rpc_servers_str.empty()) {
-            auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
-            // add RPC devices
-            if (!rpc_servers.empty()) {
-                ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
-                if (!rpc_reg) {
-                    fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
-                    exit(1);
-                }
-                typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
-                ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
-                if (!ggml_backend_rpc_add_device_fn) {
-                    fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
-                    exit(1);
-                }
-                static std::vector<ggml_backend_dev_t> devices;
-                devices.clear();
-                for (const std::string & server : rpc_servers) {
-                    ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
-                    if (dev) {
-                        devices.push_back(dev);
-                    } else {
-                        fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
-                        exit(1);
-                    }
-                }
-                devices.push_back(nullptr);
-                mparams.devices = devices.data();
-            }
+        if (!rpc_servers.empty()) {
+            mparams.rpc_servers = rpc_servers.c_str();
         }
         mparams.split_mode = split_mode;
         mparams.main_gpu = main_gpu;
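The block removed above (the - lines) resolves the RPC backend at runtime through the ggml backend registry instead of handing a comma-separated endpoint string to mparams.rpc_servers. A condensed sketch of that registry lookup, assuming the backend-registry calls shown in the removed lines:

#include <cstdio>
#include <string>
#include <vector>
#include "ggml-backend.h"

// Sketch: turn a list of "host:port" endpoints into ggml devices via the
// dynamically registered RPC backend. Returns an empty vector on failure.
static std::vector<ggml_backend_dev_t> add_rpc_devices(const std::vector<std::string> & servers) {
    std::vector<ggml_backend_dev_t> devices;

    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
    if (!rpc_reg) {
        fprintf(stderr, "RPC backend not available\n");
        return devices;
    }

    // the device-add entry point is exposed as a proc address, not a direct symbol
    typedef ggml_backend_dev_t (*rpc_add_device_t)(const char * endpoint);
    auto add_device = (rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
    if (!add_device) {
        return devices;
    }

    for (const std::string & server : servers) {
        if (ggml_backend_dev_t dev = add_device(server.c_str())) {
            devices.push_back(dev);
        }
    }
    devices.push_back(nullptr); // llama_model_params.devices expects a null terminator
    return devices;
}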
@@ -737,7 +708,7 @@ struct cmd_params_instance {
     }
     bool equal_mparams(const cmd_params_instance & other) const {
-        return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
+        return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers &&
               split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
               tensor_split == other.tensor_split;
     }
@@ -1430,8 +1401,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
    llama_set_n_threads(ctx, n_threads, n_threads);
    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-    const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+    const int32_t n_vocab = llama_n_vocab(model);
    std::vector<llama_token> tokens(n_batch);
@@ -1439,7 +1409,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
    while (n_processed < n_prompt) {
        int n_tokens = std::min(n_prompt - n_processed, n_batch);
-        tokens[0] = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
+        tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
        for (int i = 1; i < n_tokens; i++) {
            tokens[i] = std::rand() % n_vocab;
        }
@@ -1454,10 +1424,9 @@ static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
    llama_set_n_threads(ctx, n_threads, n_threads);
    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-    const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+    const int32_t n_vocab = llama_n_vocab(model);
-    llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
+    llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
    for (int i = 0; i < n_gen; i++) {
        llama_decode(ctx, llama_batch_get_one(&token, 1));
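test_prompt and test_gen above benchmark with synthetic input: random token ids bounded by the vocab size, with BOS as the first token when the model expects one, and no sampling since only decode throughput matters. A minimal sketch of the generation-side loop, using the vocab-based calls from the - side (the refresh of `token` each step is an assumption of this sketch, not a quote of the benchmark):

#include <cstdlib>
#include "llama.h"

// Sketch: time token-by-token decoding with random tokens.
static void bench_gen(llama_context * ctx, int n_gen) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);
    const int32_t n_vocab = llama_vocab_n_tokens(vocab);

    // start from BOS if the model expects it, otherwise from a random token
    llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab)
                                                       : std::rand() % n_vocab;
    for (int i = 0; i < n_gen; i++) {
        llama_decode(ctx, llama_batch_get_one(&token, 1)); // single-token batch
        token = std::rand() % n_vocab;                     // feed in fresh random data
    }
}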
@@ -1552,15 +1521,15 @@ int main(int argc, char ** argv) {
    for (const auto & inst : params_instances) {
        params_idx++;
        if (params.progress) {
-            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
+            fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
        }
        // keep the same model between tests when possible
        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
            if (lmodel) {
-                llama_model_free(lmodel);
+                llama_free_model(lmodel);
            }
-            lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
+            lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
            if (lmodel == NULL) {
                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
                return 1;
@@ -1568,10 +1537,10 @@ int main(int argc, char ** argv) {
            prev_inst = &inst;
        }
-        llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
+        llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
-            llama_model_free(lmodel);
+            llama_free_model(lmodel);
            return 1;
        }
@@ -1604,14 +1573,14 @@ int main(int argc, char ** argv) {
        // warmup run
        if (t.n_prompt > 0) {
            if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
            }
            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
            test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
        }
        if (t.n_gen > 0) {
            if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
            }
            test_gen(ctx, 1, t.n_threads);
        }
@@ -1623,14 +1592,14 @@ int main(int argc, char ** argv) {
            if (t.n_prompt > 0) {
                if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
+                    fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
                test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
            }
            if (t.n_gen > 0) {
                if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
+                    fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
                test_gen(ctx, t.n_gen, t.n_threads);
@@ -1657,7 +1626,7 @@ int main(int argc, char ** argv) {
            ggml_threadpool_free_fn(threadpool);
        }
-        llama_model_free(lmodel);
+        llama_free_model(lmodel);
        if (p) {
            p->print_footer();

View file

@@ -19,7 +19,6 @@ android {
    externalNativeBuild {
        cmake {
            arguments += "-DLLAMA_BUILD_COMMON=ON"
-            arguments += "-DGGML_LLAMAFILE=OFF"
            arguments += "-DCMAKE_BUILD_TYPE=Release"
            cppFlags += listOf()
            arguments += listOf()

View file

@@ -87,7 +87,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi
    auto path_to_model = env->GetStringUTFChars(filename, 0);
    LOGi("Loading model from %s", path_to_model);
-    auto model = llama_model_load_from_file(path_to_model, model_params);
+    auto model = llama_load_model_from_file(path_to_model, model_params);
    env->ReleaseStringUTFChars(filename, path_to_model);
    if (!model) {
@@ -102,7 +102,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
-    llama_model_free(reinterpret_cast<llama_model *>(model));
+    llama_free_model(reinterpret_cast<llama_model *>(model));
 }
 extern "C"
@@ -305,9 +305,7 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
-    //llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
-    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
-    delete batch;
+    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
 }
 extern "C"
@@ -347,7 +345,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
        jlong context_pointer,
        jlong batch_pointer,
        jstring jtext,
-        jboolean format_chat,
        jint n_len
    ) {
@@ -357,8 +354,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
    const auto context = reinterpret_cast<llama_context *>(context_pointer);
    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
-    bool parse_special = (format_chat == JNI_TRUE);
-    const auto tokens_list = common_tokenize(context, text, true, parse_special);
+    const auto tokens_list = common_tokenize(context, text, 1);
    auto n_ctx = llama_n_ctx(context);
    auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
@@ -370,7 +366,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
    }
    for (auto id : tokens_list) {
-        LOGi("token: `%s`-> %d ", common_token_to_piece(context, id).c_str(), id);
+        LOGi("%s", common_token_to_piece(context, id).c_str());
    }
    common_batch_clear(*batch);
@@ -407,7 +403,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
    const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
    const auto model = llama_get_model(context);
-    const auto vocab = llama_model_get_vocab(model);
    if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
    if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
@@ -417,7 +412,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
    const auto new_token_id = llama_sampler_sample(sampler, context, -1);
    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
-    if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) {
+    if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
        return nullptr;
    }
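completion_loop above samples one token per call from the JNI side and stops on an end-of-generation token or when the target length is reached. A stripped-down sketch of that step, using the vocab-based check from the - side and leaving the JNI plumbing out:

#include <string>
#include "common.h"
#include "llama.h"

// Sketch: one decode step; returns the sampled piece, or an empty string
// when generation should stop. `smpl` is assumed to be a configured llama_sampler.
static std::string completion_step(llama_context * ctx, llama_sampler * smpl, int n_cur, int n_len) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1); // sample from the last logits
    if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) {
        return ""; // end of generation
    }
    return common_token_to_piece(ctx, new_token_id);
}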

View file

@@ -65,7 +65,6 @@ class LLamaAndroid {
        context: Long,
        batch: Long,
        text: String,
-        formatChat: Boolean,
        nLen: Int
    ): Int
@@ -116,10 +115,10 @@ class LLamaAndroid {
        }
    }
-    fun send(message: String, formatChat: Boolean = false): Flow<String> = flow {
+    fun send(message: String): Flow<String> = flow {
        when (val state = threadLocalState.get()) {
            is State.Loaded -> {
-                val ncur = IntVar(completion_init(state.context, state.batch, message, formatChat, nlen))
+                val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
                while (ncur.value <= nlen) {
                    val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
                    if (str == null) {

Some files were not shown because too many files have changed in this diff.