commit 0e5165b605

52 changed files with 0 additions and 4783 deletions

@@ -1,22 +0,0 @@
node('x86_runner1'){                          // Run on the x86 runner that has the latest vector qemu, the latest vector gcc and all the necessary libraries
    stage('Cleanup'){
        cleanWs()                             // Clean the previous CI build from the workspace
    }
    stage('checkout repo'){
        retry(5){                             // Retry if cloning fails for some reason
            checkout scm                      // Clone the repo onto the runner
        }
    }
    stage('Compiling llama.cpp'){
        sh'''#!/bin/bash
        make RISCV=1 RISCV_CROSS_COMPILE=1    # Compile llama.cpp for RISC-V
        '''
    }
    stage('Running llama.cpp'){
        sh'''#!/bin/bash
        module load gnu-bin2/0.1              # Load the latest versions of vector qemu and vector gcc
        qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt   # Run llama.cpp on the vector qemu-riscv64
        cat llama_log.txt                     # Print the results
        '''
    }
}

@@ -1,36 +0,0 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1

# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1

RUN make -j$(nproc)

ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,50 +0,0 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set the GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev

RUN make -j$(nproc)

ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,25 +0,0 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

ENV LLAMA_CURL=1


RUN make -j$(nproc)

ENV LC_ALL=C.utf8

ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,35 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1

RUN make -j$(nproc) llama-cli

FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

RUN apt-get update && \
    apt-get install -y libgomp1

COPY --from=build /app/llama-cli /llama-cli

ENTRYPOINT [ "/llama-cli" ]

@@ -1,28 +0,0 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git

WORKDIR /app

COPY . .

RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with static libs" && \
    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
    cmake --build build --config Release --target llama-cli

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

COPY --from=build /app/build/bin/llama-cli /llama-cli

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-cli" ]

@@ -1,45 +0,0 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set the GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

RUN make -j$(nproc) llama-cli

ENTRYPOINT [ "/app/llama-cli" ]

@@ -1,27 +0,0 @@
ARG UBUNTU_VERSION=jammy

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget libgomp1

# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk

# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_VULKAN=1 && \
    cmake --build build --config Release --target llama-cli

# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-cli /llama-cli && \
    rm -rf /app

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-cli" ]

@@ -1,23 +0,0 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

RUN make -j$(nproc) llama-cli

FROM ubuntu:$UBUNTU_VERSION AS runtime

RUN apt-get update && \
    apt-get install -y libgomp1

COPY --from=build /app/llama-cli /llama-cli

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-cli" ]

@@ -1,83 +0,0 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
#    We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc. installed.
#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
#    It is up to the user to install the correct vendor-specific support.

Name:           llama.cpp-cuda
Version:        %( date "+%%Y%%m%%d" )
Release:        1%{?dist}
Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License:        MIT
Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
Requires:       cuda-toolkit
URL:            https://github.com/ggerganov/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0

%description
CPU inference for Meta's Llama 2 models using default options.

%prep
%setup -n llama.cpp-master

%build
make -j GGML_CUDA=1

%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple

mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never

[Install]
WantedBy=default.target
EOF

mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF

%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%{_bindir}/llama-cuda-cli
%{_bindir}/llama-cuda-server
%{_bindir}/llama-cuda-simple
/usr/lib/systemd/system/llamacuda.service
%config /etc/sysconfig/llama

%pre

%post

%preun
%postun

%changelog

@@ -1,85 +0,0 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
#    We need to declare standard versioning if people want to sort latest releases.
#    In the meantime, YYYYMMDD format will be used.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc. installed.
#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
#    It is up to the user to install the correct vendor-specific support.

Name:           llama.cpp
Version:        %( date "+%%Y%%m%%d" )
Release:        1%{?dist}
Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License:        MIT
Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
Requires:       libstdc++
URL:            https://github.com/ggerganov/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0

%description
CPU inference for Meta's Llama 2 models using default options.
Models are not included in this package and must be downloaded separately.

%prep
%setup -n llama.cpp-master

%build
make -j

%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
cp -p llama-server %{buildroot}%{_bindir}/llama-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple

mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llama-server $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never

[Install]
WantedBy=default.target
EOF

mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF

%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%{_bindir}/llama-cli
%{_bindir}/llama-server
%{_bindir}/llama-simple
/usr/lib/systemd/system/llama.service
%config /etc/sysconfig/llama

%pre

%post

%preun
%postun

%changelog

@@ -1,39 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential git libcurl4-openssl-dev

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1

RUN make -j$(nproc) llama-server

FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl

COPY --from=build /app/llama-server /llama-server

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]

@@ -1,32 +0,0 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-server

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev curl

COPY --from=build /app/build/bin/llama-server /llama-server

ENV LC_ALL=C.utf8

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]

@@ -1,52 +0,0 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set the GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev curl

RUN make -j$(nproc) llama-server

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

@@ -1,29 +0,0 @@
ARG UBUNTU_VERSION=jammy

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl

# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build build --config Release --target llama-server

# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-server /llama-server && \
    rm -rf /app

ENV LC_ALL=C.utf8

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]

@@ -1,27 +0,0 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
    apt-get install -y build-essential git libcurl4-openssl-dev curl

WORKDIR /app

COPY . .

ENV LLAMA_CURL=1

RUN make -j$(nproc) llama-server

FROM ubuntu:$UBUNTU_VERSION AS runtime

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1

COPY --from=build /app/llama-server /llama-server

ENV LC_ALL=C.utf8

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]

@@ -1,21 +0,0 @@
{
  perSystem =
    { config, lib, ... }:
    {
      apps =
        let
          inherit (config.packages) default;
          binaries = [
            "llama-cli"
            "llama-embedding"
            "llama-server"
            "llama-quantize"
          ];
          mkApp = name: {
            type = "app";
            program = "${default}/bin/${name}";
          };
        in
        lib.genAttrs binaries mkApp;
    };
}

@@ -1,13 +0,0 @@
{
  perSystem =
    { config, lib, ... }:
    {
      devShells =
        lib.concatMapAttrs
          (name: package: {
            ${name} = package.passthru.shell;
            ${name + "-extra"} = package.passthru.shell-extra;
          })
          config.packages;
    };
}

@@ -1,37 +0,0 @@
{
  lib,
  dockerTools,
  buildEnv,
  llama-cpp,
  interactive ? true,
  coreutils,
}:

# A tar that can be fed into `docker load`:
#
#   $ nix build .#llamaPackages.docker
#   $ docker load < result

# For details and variations cf.
# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
# - https://nixery.dev/

# Approximate (compressed) sizes, at the time of writing, are:
#
# .#llamaPackages.docker: 125M;
# .#llamaPackagesCuda.docker: 537M;
# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.

dockerTools.buildLayeredImage {
  name = llama-cpp.pname;
  tag = "latest";

  contents =
    [ llama-cpp ]
    ++ lib.optionals interactive [
      coreutils
      dockerTools.binSh
      dockerTools.caCertificates
    ];
}

@@ -1,39 +0,0 @@
{ inputs, ... }:
{
  perSystem =
    {
      config,
      system,
      lib,
      pkgsCuda,
      ...
    }:
    {
      legacyPackages =
        let
          caps.llamaPackagesXavier = "7.2";
          caps.llamaPackagesOrin = "8.7";
          caps.llamaPackagesTX2 = "6.2";
          caps.llamaPackagesNano = "5.3";

          pkgsFor =
            cap:
            import inputs.nixpkgs {
              inherit system;
              config = {
                cudaSupport = true;
                cudaCapabilities = [ cap ];
                cudaEnableForwardCompat = false;
                inherit (pkgsCuda.config) allowUnfreePredicate;
              };
            };
        in
        builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;

      packages = lib.optionalAttrs (system == "aarch64-linux") {
        jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
        jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
        jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
      };
    };
}

@@ -1,47 +0,0 @@
{ inputs, ... }:
{
  # The _module.args definitions are passed on to modules as arguments. E.g.
  # the module `{ pkgs ... }: { /* config */ }` implicitly uses
  # `_module.args.pkgs` (defined in this case by flake-parts).
  perSystem =
    { system, ... }:
    {
      _module.args = {
        # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
        # again, the below creates several nixpkgs instances which the
        # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
        #
        # This is currently "slow" and "expensive", on a certain scale.
        # This also isn't "right" in that this hinders dependency injection at
        # the level of flake inputs. This might get removed in the foreseeable
        # future.
        #
        # Note that you can use these expressions without Nix
        # (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).

        pkgsCuda = import inputs.nixpkgs {
          inherit system;
          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
          # and ucx are built with CUDA support)
          config.cudaSupport = true;
          config.allowUnfreePredicate =
            p:
            builtins.all
              (
                license:
                license.free
                || builtins.elem license.shortName [
                  "CUDA EULA"
                  "cuDNN EULA"
                ]
              )
              (p.meta.licenses or [ p.meta.license ]);
        };
        # Ensure dependencies use ROCm consistently
        pkgsRocm = import inputs.nixpkgs {
          inherit system;
          config.rocmSupport = true;
        };
      };
    };
}

@@ -1,331 +0,0 @@
{
  lib,
  glibc,
  config,
  stdenv,
  mkShell,
  runCommand,
  cmake,
  ninja,
  pkg-config,
  git,
  python3,
  mpi,
  blas,
  cudaPackages,
  darwin,
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
  curl,
  shaderc,
  useBlas ? builtins.all (x: !x) [
    useCuda
    useMetalKit
    useRocm
    useVulkan
  ] && blas.meta.available,
  useCuda ? config.cudaSupport,
  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
  useMpi ? false, # Increases the runtime closure size by ~700M
  useRocm ? config.rocmSupport,
  enableCurl ? true,
  useVulkan ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

  # It's necessary to consistently use backendStdenv when building with CUDA support,
  # otherwise we get libstdc++ errors downstream.
  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
  precompileMetalShaders ? false
}@inputs:

let
  inherit (lib)
    cmakeBool
    cmakeFeature
    optionals
    strings
    versionOlder
    ;

  stdenv = throw "Use effectiveStdenv instead";

  suffices =
    lib.optionals useBlas [ "BLAS" ]
    ++ lib.optionals useCuda [ "CUDA" ]
    ++ lib.optionals useMetalKit [ "MetalKit" ]
    ++ lib.optionals useMpi [ "MPI" ]
    ++ lib.optionals useRocm [ "ROCm" ]
    ++ lib.optionals useVulkan [ "Vulkan" ];

  pnameSuffix =
    strings.optionalString (suffices != [ ])
      "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
  descriptionSuffix =
    strings.optionalString (suffices != [ ])
      ", accelerated with ${strings.concatStringsSep ", " suffices}";

  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;

  # TODO: package the Python in this repository in a Nix-like way.
  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
  # https://peps.python.org/pep-0517/
  #
  # TODO: Package up each Python script or service appropriately, by making
  # them into "entrypoints"
  llama-python = python3.withPackages (
    ps: [
      ps.numpy
      ps.sentencepiece
    ]
  );

  # TODO(Green-Sky): find a better way to opt into the heavy ML Python runtime
  llama-python-extra = python3.withPackages (
    ps: [
      ps.numpy
      ps.sentencepiece
      ps.tiktoken
      ps.torchWithoutCuda
      ps.transformers

      # server bench
      ps.matplotlib

      # server tests
      ps.openai
      ps.behave
      ps.prometheus-client

      # for examples/pydantic-models-to-grammar-examples.py
      ps.docstring-parser
      ps.pydantic

      # for scripts/compare-llama-bench.py
      ps.gitpython
      ps.tabulate
    ]
  );

  xcrunHost = runCommand "xcrunHost" {} ''
    mkdir -p $out/bin
    ln -s /usr/bin/xcrun $out/bin
  '';

  # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
  # separately
  darwinBuildInputs =
    with darwin.apple_sdk.frameworks;
    [
      Accelerate
      CoreVideo
      CoreGraphics
    ]
    ++ optionals useMetalKit [ MetalKit ];

  cudaBuildInputs = with cudaPackages; [
    cuda_cccl.dev # <nv/target>

    # A temporary hack for reducing the closure size, remove once cudaPackages
    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
    cuda_cudart.dev
    cuda_cudart.lib
    cuda_cudart.static
    libcublas.dev
    libcublas.lib
    libcublas.static
  ];

  rocmBuildInputs = with rocmPackages; [
    clr
    hipblas
    rocblas
  ];

  vulkanBuildInputs = [
    vulkan-headers
    vulkan-loader
    shaderc
  ];
in

effectiveStdenv.mkDerivation (
  finalAttrs: {
    pname = "llama-cpp${pnameSuffix}";
    version = llamaVersion;

    # Note: none of the files discarded here are visible in the sandbox or
    # affect the output hash. This also means they can be modified without
    # triggering a rebuild.
    src = lib.cleanSourceWith {
      filter =
        name: type:
        let
          noneOf = builtins.all (x: !x);
          baseName = baseNameOf name;
        in
        noneOf [
          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
          (lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
          (lib.hasPrefix "." baseName) # Skip hidden files and directories
          (baseName == "flake.lock")
        ];
      src = lib.cleanSource ../../.;
    };

    postPatch = ''
      substituteInPlace ./ggml/src/ggml-metal.m \
        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
      substituteInPlace ./ggml/src/ggml-metal.m \
        --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
    '';

    # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
    # `default.metallib` may be compiled with the Metal compiler from Xcode,
    # and we need to escape the sandbox on macOS to access the Metal compiler.
    # `xcrun` is used to find the path of the Metal compiler, which is variable
    # and not on $PATH;
    # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion.
    __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;

    nativeBuildInputs =
      [
        cmake
        ninja
        pkg-config
        git
      ]
      ++ optionals useCuda [
        cudaPackages.cuda_nvcc

        # TODO: Replace with autoAddDriverRunpath
        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
        cudaPackages.autoAddOpenGLRunpathHook
      ]
      ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
        glibc.static
      ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
        xcrunHost
      ];

    buildInputs =
      optionals effectiveStdenv.isDarwin darwinBuildInputs
      ++ optionals useCuda cudaBuildInputs
      ++ optionals useMpi [ mpi ]
      ++ optionals useRocm rocmBuildInputs
      ++ optionals useBlas [ blas ]
      ++ optionals useVulkan vulkanBuildInputs
      ++ optionals enableCurl [ curl ];

    cmakeFlags =
      [
        (cmakeBool "LLAMA_BUILD_SERVER" true)
        (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
        (cmakeBool "LLAMA_CURL" enableCurl)
        (cmakeBool "GGML_NATIVE" false)
        (cmakeBool "GGML_BLAS" useBlas)
        (cmakeBool "GGML_CUDA" useCuda)
        (cmakeBool "GGML_HIPBLAS" useRocm)
        (cmakeBool "GGML_METAL" useMetalKit)
        (cmakeBool "GGML_VULKAN" useVulkan)
        (cmakeBool "GGML_STATIC" enableStatic)
      ]
      ++ optionals useCuda [
        (
          with cudaPackages.flags;
          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
          )
        )
      ]
      ++ optionals useRocm [
        (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
        (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
      ]
      ++ optionals useMetalKit [
        (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
        (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
      ];

    # Environment variables needed for ROCm
    env = optionals useRocm {
      ROCM_PATH = "${rocmPackages.clr}";
      HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
    };

    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
    # if they haven't been added yet.
    postInstall = ''
      mkdir -p $out/include
      cp $src/include/llama.h $out/include/
    '';

    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
    passthru = {
      inherit
        useBlas
        useCuda
        useMetalKit
        useMpi
        useRocm
        useVulkan
        ;

      shell = mkShell {
        name = "shell-${finalAttrs.finalPackage.name}";
        description = "contains numpy and sentencepiece";
        buildInputs = [ llama-python ];
        inputsFrom = [ finalAttrs.finalPackage ];
        shellHook = ''
          addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
        '';
      };

      shell-extra = mkShell {
        name = "shell-extra-${finalAttrs.finalPackage.name}";
        description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
        buildInputs = [ llama-python-extra ];
        inputsFrom = [ finalAttrs.finalPackage ];
      };
    };

    meta = {
      # Configurations we don't want even the CI to evaluate. Results in the
      # "unsupported platform" messages. This is mostly a no-op, because
      # cudaPackages would've refused to evaluate anyway.
      badPlatforms = optionals useCuda lib.platforms.darwin;

      # Configurations that are known to result in build failures. Can be
      # overridden by importing Nixpkgs with `allowBroken = true`.
      broken = (useMetalKit && !effectiveStdenv.isDarwin);

      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
      homepage = "https://github.com/ggerganov/llama.cpp/";
      license = lib.licenses.mit;

      # Accommodates `nix run` and `lib.getExe`
      mainProgram = "llama-cli";

      # These people might respond, on a best-effort basis, if you ping them
      # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
      # Consider adding yourself to this list if you want to ensure this flake
      # stays maintained and you're willing to invest your time. Do not add
      # other people without their consent. Consider removing people after
      # they've been unreachable for long periods of time.

      # Note that lib.maintainers is defined in Nixpkgs, but you may just add
      # an attrset following the same format as in
      # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
      maintainers = with lib.maintainers; [
        philiptaron
        SomeoneSerge
      ];

      # Extend `badPlatforms` instead
      platforms = lib.platforms.all;
    };
  }
)

@@ -1,19 +0,0 @@
{
  lib,
  newScope,
  llamaVersion ? "0.0.0",
}:

# We're using `makeScope` instead of just writing out an attrset
# because it allows users to apply overlays later using `overrideScope'`.
# Cf. https://noogle.dev/f/lib/makeScope

lib.makeScope newScope (
  self: {
    inherit llamaVersion;
    llama-cpp = self.callPackage ./package.nix { };
    docker = self.callPackage ./docker.nix { };
    docker-min = self.callPackage ./docker.nix { interactive = false; };
    sif = self.callPackage ./sif.nix { };
  }
)

@@ -1,27 +0,0 @@
{
  lib,
  singularity-tools,
  llama-cpp,
  bashInteractive,
  interactive ? false,
}:

let
  optionalInt = cond: x: if cond then x else 0;
in
singularity-tools.buildImage rec {
  inherit (llama-cpp) name;
  contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];

  # These are excessive (but safe) for most variants. Building singularity
  # images requires superuser privileges, so we build them inside a VM in a
  # writable image of pre-determined size.
  #
  # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
  #
  # Expected image sizes:
  # - cpu/blas: 150M,
  # - cuda, all gencodes: 560M,
  diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
  memSize = diskSize;
}

@@ -1,41 +0,0 @@
#!/bin/bash
set -e

# Read the first argument into a variable
arg1="$1"

# Shift the arguments to remove the first one
shift

if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
    python3 ./convert_hf_to_gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
    ./llama-cli "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
        if [ -f "${i/f16/q4_0}" ]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
        fi
    done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
    ./llama-server "$@"
else
    echo "Unknown command: $arg1"
    echo "Available commands: "
    echo "  --run (-r): Run a model previously converted into ggml"
    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --convert (-c): Convert a llama model into ggml"
    echo "              ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
    echo "  --all-in-one (-a): Execute --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
    echo "  --server (-s): Run a model on the server"
    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080"
fi

.github/ISSUE_TEMPLATE/01-bug-low.yml (vendored)
@@ -1,50 +0,0 @@
name: Low Severity Bugs
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non-critical UI glitches)
title: "Bug: "
labels: ["bug-unconfirmed", "low severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell

.github/ISSUE_TEMPLATE/02-bug-medium.yml (vendored)
@@ -1,50 +0,0 @@
name: Medium Severity Bug
description: Used to report medium severity bugs in llama.cpp (e.g. malfunctioning features that leave the software generally still usable)
title: "Bug: "
labels: ["bug-unconfirmed", "medium severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell

.github/ISSUE_TEMPLATE/03-bug-high.yml (vendored)
@@ -1,50 +0,0 @@
name: High Severity Bug
description: Used to report high severity bugs in llama.cpp (e.g. malfunctioning features hindering important common workflows)
title: "Bug: "
labels: ["bug-unconfirmed", "high severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell

.github/ISSUE_TEMPLATE/04-bug-critical.yml (vendored)
@@ -1,50 +0,0 @@
name: Critical Severity Bug
description: Used to report critical severity bugs in llama.cpp (e.g. crashes, corruption, data loss)
title: "Bug: "
labels: ["bug-unconfirmed", "critical severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell

.github/ISSUE_TEMPLATE/05-enhancement.yml (vendored)
@@ -1,51 +0,0 @@
name: Enhancement
description: Used to request enhancements for llama.cpp
title: "Feature Request: "
labels: ["enhancement"]
body:
  - type: markdown
    attributes:
      value: |
        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed need to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)

  - type: checkboxes
    id: prerequisites
    attributes:
      label: Prerequisites
      description: Please confirm the following before submitting your enhancement request.
      options:
        - label: I am running the latest code. Mention the version if possible as well.
          required: true
        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
          required: true
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
          required: true

  - type: textarea
    id: feature-description
    attributes:
      label: Feature Description
      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
      placeholder: Detailed description of the enhancement
    validations:
      required: true

  - type: textarea
    id: motivation
    attributes:
      label: Motivation
      description: Please provide a detailed written description of the reasons why this feature is necessary and how it is useful to `llama.cpp` users.
      placeholder: Explanation of why this feature is needed and its benefits
    validations:
      required: true

  - type: textarea
    id: possible-implementation
    attributes:
      label: Possible Implementation
      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
      placeholder: Detailed description of potential implementation
    validations:
      required: false

.github/ISSUE_TEMPLATE/06-research.yml (vendored)
@@ -1,52 +0,0 @@
name: Research
description: Track a new technical research area
title: "Research: "
labels: ["research 🔬"]
body:
  - type: markdown
    attributes:
      value: |
        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

  - type: checkboxes
    id: research-stage
    attributes:
      label: Research Stage
      description: Track the general state of this research ticket
      options:
        - label: Background Research (Let's try to avoid reinventing the wheel)
        - label: Hypothesis Formed (How do you think this will work, and what will its effect be?)
        - label: Strategy / Implementation Forming
        - label: Analysis of results
        - label: Debrief / Documentation (So people in the future can learn from us)

  - type: textarea
    id: background
    attributes:
      label: Previous existing literature and research
      description: What's the current state of the art, and what's the motivation for this research?

  - type: textarea
    id: hypothesis
    attributes:
      label: Hypothesis
      description: How do you think this will work, and what will its effect be?

  - type: textarea
    id: implementation
    attributes:
      label: Implementation
      description: Got an approach? e.g. a PR ready to go?

  - type: textarea
    id: analysis
    attributes:
      label: Analysis
      description: How does the proposed implementation behave?

  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell

28 .github/ISSUE_TEMPLATE/07-refactor.yml vendored

@@ -1,28 +0,0 @@
name: Refactor (Maintainers)
description: Used to track refactoring opportunities
title: "Refactor: "
labels: ["refactor"]
body:
  - type: markdown
    attributes:
      value: |
        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

  - type: textarea
    id: background-description
    attributes:
      label: Background Description
      description: Please provide a detailed written description of the pain points you are trying to solve.
      placeholder: Detailed description behind your motivation to request refactor
    validations:
      required: true

  - type: textarea
    id: possible-approaches
    attributes:
      label: Possible Refactor Approaches
      description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
      placeholder: Your idea of possible refactoring opportunity/approaches
    validations:
      required: false
11 .github/ISSUE_TEMPLATE/config.yml vendored

@@ -1,11 +0,0 @@
blank_issues_enabled: true
contact_links:
  - name: Got an idea?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
    about: Pop it there. It may then become an enhancement ticket.
  - name: Got a question?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
    about: Ask a question there!
  - name: Want to contribute?
    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
91 .github/labeler.yml vendored

@@ -1,91 +0,0 @@
# https://github.com/actions/labeler
Kompute:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/include/ggml-kompute.h
      - ggml/src/ggml-kompute.cpp
      - README-kompute.md
Apple Metal:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/include/ggml-metal.h
      - ggml/src/ggml-metal.cpp
      - README-metal.md
SYCL:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/include/ggml-sycl.h
      - ggml/src/ggml-sycl.cpp
      - ggml/src/ggml-sycl/**
      - docs/backend/SYCL.md
      - examples/sycl/**
Nvidia GPU:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/include/ggml-cuda.h
      - ggml/src/ggml-cuda/**
Vulkan:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/ggml_vk_generate_shaders.py
      - ggml/src/ggml-vulkan*
documentation:
  - changed-files:
    - any-glob-to-any-file:
      - docs/**
      - media/**
testing:
  - changed-files:
    - any-glob-to-any-file:
      - tests/**
build:
  - changed-files:
    - any-glob-to-any-file:
      - cmake/**
      - CMakeLists.txt
      - CMakePresets.json
examples:
  - changed-files:
    - any-glob-to-any-file: examples/**
devops:
  - changed-files:
    - any-glob-to-any-file:
      - .devops/**
      - .github/**
      - ci/**
python:
  - changed-files:
    - any-glob-to-any-file:
      - "**/*.py"
      - requirements/**
      - gguf-py/**
      - .flake8
script:
  - changed-files:
    - any-glob-to-any-file:
      - scripts/**
android:
  - changed-files:
    - any-glob-to-any-file:
      - examples/llama.android/**
server:
  - changed-files:
    - any-glob-to-any-file:
      - examples/server/**
ggml:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/include/ggml*.h
      - ggml/src/ggml*.c
      - ggml/src/ggml*.cpp
      - ggml/src/ggml*.h
      - ggml-cuda/**
nix:
  - changed-files:
    - any-glob-to-any-file:
      - "**/*.nix"
      - .github/workflows/nix-*.yml
      - .devops/nix/nixpkgs-instances.nix
embedding:
  - changed-files:
    - any-glob-to-any-file: examples/embedding/
7 .github/pull_request_template.md vendored

@@ -1,7 +0,0 @@


- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
- Self-reported review complexity:
  - [ ] Low
  - [ ] Medium
  - [ ] High
310 .github/workflows/bench.yml vendored

@@ -1,310 +0,0 @@
# Benchmark
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    - cron: '04 2 * * *'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
      N_USERS: 8
      DURATION: 10m

    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"

    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
      || (
        github.event_name == 'schedule'
        && github.ref_name == 'master'
        && github.repository_owner == 'ggerganov'
      )
      || github.event_name == 'pull_request_target'
      || (
        github.event_name == 'push'
        && github.event.ref == 'refs/heads/master'
        && github.repository_owner == 'ggerganov'
      )
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'

      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
            --with github.com/phymbert/xk6-sse

      - name: Build
        id: cmake_build
        run: |
          set -eux
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
              -DCUDAToolkit_ROOT=/usr/local/cuda \
              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
              -DCMAKE_CUDA_ARCHITECTURES=75 \
              -DLLAMA_FATAL_WARNINGS=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DCMAKE_BUILD_TYPE=Release;
          cmake --build build --config Release -j $(nproc) --target llama-server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
        id: server_bench
        run: |
          set -eux

          cd examples/server/bench
          source venv/bin/activate
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
              --branch ${{ github.head_ref || github.ref_name }} \
              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
              --scenario script.js \
              --duration ${{ github.event.inputs.duration || env.DURATION }} \
              --hf-repo ggml-org/models \
              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
              --model-path-prefix /models \
              --parallel ${{ env.N_USERS }} \
              -ngl 33 \
              --batch-size 2048 \
              --ubatch-size 256 \
              --ctx-size 16384 \
              --n-prompts 1000 \
              --max-prompt-tokens 1024 \
              --max-tokens 2048

          cat results.github.env >> $GITHUB_ENV

          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true # Important as it looks unstable: 503
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux

          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">

            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

            </p>

            <details>

            <summary>Expand details for performance related PR only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}


            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>
            </details>
            </details>
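For orientation, the heavy lifting in the deleted workflow is done by `examples/server/bench/bench.py`. A minimal local sketch of the same bench step, assuming that script, a built `llama-server`, and the same phi-2 model/ftype values from the matrix (the `local`/`manual-bench` labels are illustrative):

```bash
# Local approximation of the "Server bench" step above (not the authoritative CI setup).
cd examples/server/bench
python3 -m venv venv && source venv/bin/activate
pip install -r requirements.txt
python bench.py \
    --runner-label local \
    --name manual-bench \
    --branch "$(git rev-parse --abbrev-ref HEAD)" \
    --commit "$(git rev-parse HEAD)" \
    --scenario script.js \
    --duration 10m \
    --hf-repo ggml-org/models \
    --hf-file phi-2/ggml-model-q4_0.gguf \
    --model-path-prefix /models \
    --parallel 8 \
    -ngl 33 --batch-size 2048 --ubatch-size 256 --ctx-size 16384 \
    --n-prompts 1000 --max-prompt-tokens 1024 --max-tokens 2048
```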
1315 .github/workflows/build.yml vendored

File diff suppressed because it is too large.
23 .github/workflows/close-issue.yml vendored

@@ -1,23 +0,0 @@
name: Close inactive issues
on:
  schedule:
    - cron: "42 0 * * *"

jobs:
  close-issues:
    runs-on: ubuntu-latest
    permissions:
      issues: write
      pull-requests: write
    steps:
      - uses: actions/stale@v5
        with:
          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
          days-before-issue-stale: 30
          days-before-issue-close: 14
          stale-issue-label: "stale"
          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
          days-before-pr-stale: -1
          days-before-pr-close: -1
          operations-per-run: 10000
          repo-token: ${{ secrets.GITHUB_TOKEN }}
116 .github/workflows/docker.yml vendored

@@ -1,116 +0,0 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# GitHub recommends pinning actions to a commit SHA.
# To get a newer version, you will need to update the SHA.
# You can also reference a tag or branch, but the action may change without warning.

name: Publish Docker image

on:
  #pull_request:
  push:
    branches:
      - master
    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
    #if: github.event.pull_request.draft == false

    runs-on: ubuntu-latest
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
      matrix:
        config:
          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Log in to Docker Hub
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
          tool-cache: false

          # all of these default to true, but feel free to set to
          # "false" if necessary for your workflow
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: true
          swap-storage: true

      - name: Determine tag name
        id: tag
        shell: bash
        run: |
          BUILD_NUMBER="$(git rev-list --count HEAD)"
          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
          else
            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi

      - name: Downcase github.repository_owner
        run: |
          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
        env:
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
        uses: docker/build-push-action@v4
        with:
          context: .
          push: ${{ github.event_name == 'push' }}
          platforms: ${{ matrix.config.platforms }}
          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}
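For context, the matrix above publishes one image per `tag` value under the owner's GHCR namespace, with an additional `-b<build>` (or `-<sha>`) suffix from the tag step. A hedged sketch of pulling the results, assuming the upstream `ggerganov` owner (substitute the actual lowercased repository owner):

```bash
# Illustrative pulls; tag names come from the matrix above, the registry path from the push steps.
docker pull ghcr.io/ggerganov/llama.cpp:light        # CLI-only image, latest build
docker pull ghcr.io/ggerganov/llama.cpp:server-cuda  # CUDA-enabled server image
# A pinned build would use the "<tag>-b<build number>" form produced by the "Determine tag name" step.
```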
27 .github/workflows/editorconfig.yml vendored

@@ -1,27 +0,0 @@
name: EditorConfig Checker

on:
  workflow_dispatch: # allows manual triggering
    inputs:
      create_release:
        description: 'Create new release'
        required: true
        type: boolean
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  editorconfig:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: editorconfig-checker/action-editorconfig-checker@main
      - run: editorconfig-checker
44 .github/workflows/gguf-publish.yml vendored

@@ -1,44 +0,0 @@
# This workflow will upload a Python Package using Twine when a GGUF release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

# See `gguf-py/README.md` for how to make a release.

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
  workflow_dispatch:
  push:
    # Pattern matched against refs/tags
    tags:
      - 'gguf-v*' # Push events to every version tag

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9.x'
      - name: Install dependencies
        run: |
          cd gguf-py
          python -m pip install poetry
          poetry install

      - name: Build package
        run: cd gguf-py && poetry build
      - name: Publish package
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
          packages-dir: gguf-py/dist
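The publish job only wraps a Poetry build of `gguf-py`; a rough local dry run of the same packaging steps (everything except the PyPI upload, which the action handles with the repository token) would look like this:

```bash
# Build the gguf-py package locally, mirroring the "Install dependencies" and "Build package" steps.
cd gguf-py
python -m pip install poetry
poetry install
poetry build          # wheels and sdist land in gguf-py/dist, the same directory the workflow publishes
```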
17 .github/workflows/labeler.yml vendored

@@ -1,17 +0,0 @@
name: "Pull Request Labeler"
on:
  - pull_request_target

jobs:
  labeler:
    permissions:
      contents: read
      pull-requests: write
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          repository: "ggerganov/llama.cpp"
      - uses: actions/labeler@v5
        with:
          configuration-path: '.github/labeler.yml'
65 .github/workflows/nix-ci-aarch64.yml vendored

@@ -1,65 +0,0 @@
name: Nix aarch64 builds

on:
  workflow_dispatch: # allows manual triggering
  schedule:
    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
    # 1.5h instead of minutes with the cold cache).
    #
    # randint(0, 59), randint(0, 23)
    - cron: '26 12 * * *'
  # But also rebuild if we touched any of the Nix expressions:
  push:
    branches:
      - master
    paths: ['**/*.nix', 'flake.lock']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['**/*.nix', 'flake.lock']

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  nix-build-aarch64:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install QEMU
        # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
        run: |
          sudo apt-get update
          sudo apt-get install -y qemu-user-static qemu-system-aarch64
          sudo usermod -a -G kvm $USER
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-platforms = aarch64-linux
            extra-system-features = nixos-test kvm
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: llama-cpp
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.aarch64-linux"
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --systems aarch64-linux
          --flake
          ".#checks.aarch64-linux"
72 .github/workflows/nix-ci.yml vendored

@@ -1,72 +0,0 @@
name: Nix CI

on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
  pull_request:
    types: [opened, synchronize, reopened]

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  nix-eval:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: List all flake outputs
        run: nix flake show --all-systems
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
  nix-build:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: llama-cpp
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --flake
          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
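Both Nix jobs boil down to two commands that can also be run locally; a rough equivalent, assuming a Nix install with flakes enabled (without the workflow's extra substituters, everything simply builds from source):

```bash
# Local approximation of the nix-eval and nix-build jobs above.
nix flake show --all-systems
nix run github:Mic92/nix-fast-build -- --skip-cached --no-nom \
    --flake ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
```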
22 .github/workflows/nix-flake-update.yml vendored

@@ -1,22 +0,0 @@
name: update-flake-lock
on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00

jobs:
  lockfile:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@main
      - name: Update flake.lock
        uses: DeterminateSystems/update-flake-lock@main
        with:
          pr-title: "nix: update flake.lock"
          pr-labels: |
            nix
          pr-reviewers: philiptaron,SomeoneSerge
          token: ${{ secrets.FLAKE_TOKEN }}
36 .github/workflows/nix-publish-flake.yml vendored

@@ -1,36 +0,0 @@
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
  push:
    tags:
      - "*"
  workflow_dispatch:
    inputs:
      tag:
        description: "The existing tag to publish"
        type: "string"
        required: true
jobs:
  flakestry-publish:
    runs-on: ubuntu-latest
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: flakestry/flakestry-publish@main
        with:
          version: "${{ inputs.tag || github.ref_name }}"
  flakehub-publish:
    runs-on: "ubuntu-latest"
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: "actions/checkout@v4"
        with:
          ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
      - uses: "DeterminateSystems/nix-installer-action@main"
      - uses: "DeterminateSystems/flakehub-push@main"
        with:
          visibility: "public"
          tag: "${{ inputs.tag }}"
35 .github/workflows/python-check-requirements.yml vendored

@@ -1,35 +0,0 @@
name: Python check requirements.txt

on:
  push:
    paths:
      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
      - 'requirements.txt'
      - 'requirements/*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
      - 'requirements.txt'
      - 'requirements/*.txt'

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  python-check-requirements:
    runs-on: ubuntu-latest
    name: check-requirements
    steps:
      - name: Check out source repository
        uses: actions/checkout@v4
      - name: Set up Python environment
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Run check-requirements.sh script
        run: bash scripts/check-requirements.sh
23 .github/workflows/python-lint.yml vendored

@@ -1,23 +0,0 @@
name: flake8 Lint

on: [push, pull_request]

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  flake8-lint:
    runs-on: ubuntu-latest
    name: Lint
    steps:
      - name: Check out source repository
        uses: actions/checkout@v4
      - name: Set up Python environment
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: flake8 Lint
        uses: py-actions/flake8@v2
        with:
          plugins: "flake8-no-print"
38 .github/workflows/python-type-check.yml vendored

@@ -1,38 +0,0 @@
name: Python Type-Check

on:
  push:
    paths:
      - '.github/workflows/python-type-check.yml'
      - '**.py'
      - '**/requirements*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-type-check.yml'
      - '**.py'
      - '**/requirements*.txt'

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  python-type-check:
    runs-on: ubuntu-latest
    name: pyright type-check
    steps:
      - name: Check out source repository
        uses: actions/checkout@v4
      - name: Set up Python environment
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Python dependencies
        # TODO: use a venv
        run: pip install -r requirements/requirements-all.txt
      - name: Type-check with Pyright
        uses: jakebailey/pyright-action@v2
        with:
          version: 1.1.370
          level: warning
          warnings: true
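To reproduce the type-check locally, a minimal sketch is to install the project requirements and run the pip-distributed pyright CLI pinned to the same version the workflow uses (assumption: the pip wrapper behaves like the action; the action's `level`/`warnings` knobs are applied on top of plain `pyright` output):

```bash
# Local approximation of the pyright job above.
pip install -r requirements/requirements-all.txt
pip install pyright==1.1.370   # same version the workflow pins
pyright                        # type-check the repository with the checked-in pyright config
```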
183 .github/workflows/server.yml vendored

@@ -1,183 +0,0 @@
# Server build and tests
name: Server

on:
  workflow_dispatch: # allows manual triggering
    inputs:
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      slow_tests:
        description: 'Run slow tests'
        required: true
        type: boolean
  push:
    branches:
      - master
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  server:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
        build_type: [RelWithDebInfo]
        include:
          - build_type: Release
            sanitizer: ""
      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken

    steps:
      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get -y install \
            build-essential \
            xxd \
            git \
            cmake \
            curl \
            wget \
            language-pack-en \
            libcurl4-openssl-dev

      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Python setup
        id: setup_python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Tests dependencies
        id: test_dependencies
        run: |
          pip install -r examples/server/tests/requirements.txt

      - name: Verify server deps
        id: verify_server_deps
        run: |
          git config --global --add safe.directory $(realpath .)
          cd examples/server
          git ls-files --others --modified
          git status
          ./deps.sh
          git status
          not_ignored_files="$(git ls-files --others --modified)"
          echo "Modified files: ${not_ignored_files}"
          if [ -n "${not_ignored_files}" ]; then
            echo "Repository is dirty or server deps are not built as expected"
            echo "${not_ignored_files}"
            exit 1
          fi

      - name: Build (no OpenMP)
        id: cmake_build_no_openmp
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          cmake -B build \
            -DGGML_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DGGML_OPENMP=OFF ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

      - name: Build
        id: cmake_build
        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
            -DGGML_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

      - name: Tests
        id: server_integration_tests
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh

      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow


  server-windows:
    runs-on: windows-2019

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: libCURL
        id: get_libcurl
        env:
          CURL_VERSION: 8.6.0_6
        run: |
          curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
          mkdir $env:RUNNER_TEMP/libcurl
          tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl

      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

      - name: Python setup
        id: setup_python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Tests dependencies
        id: test_dependencies
        run: |
          pip install -r examples/server/tests/requirements.txt

      - name: Copy Libcurl
        id: prepare_libcurl
        run: |
          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll

      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
          cd examples/server/tests
          behave.exe --stop --no-skipped --no-capture --tags slow
29 ci/README.md

@@ -1,29 +0,0 @@
# CI

In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:

https://github.com/ggml-org/ci

It monitors the `master` branch for new commits and runs the
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
to cover various hardware architectures, including GPU and Apple Silicon instances.

Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
Only the branches of this repo are monitored for this keyword.

It is a good practice, before publishing changes to execute the full CI locally on your machine:

```bash
mkdir tmp

# CPU-only build
bash ./ci/run.sh ./tmp/results ./tmp/mnt

# with CUDA support
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

# with SYCL support
source /opt/intel/oneapi/setvars.sh
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```
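As the README notes, the custom CI only runs for commits on branches of this repo that carry the `ggml-ci` keyword; a trivial example of opting a commit in (the commit message itself is illustrative):

```bash
# Request a run of the heavier ggml-org/ci pipeline for this commit.
git commit -am "server : fix request handling (ggml-ci)"
git push
```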
775 ci/run.sh

@@ -1,775 +0,0 @@
#/bin/bash
#
# sample usage:
#
# mkdir tmp
#
# # CPU-only build
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#

if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
    exit 1
fi

mkdir -p "$1"
mkdir -p "$2"

OUT=$(realpath "$1")
MNT=$(realpath "$2")

rm -f "$OUT/*.log"
rm -f "$OUT/*.exit"
rm -f "$OUT/*.md"

sd=`dirname $0`
cd $sd/../
SRC=`pwd`

CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"

if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
fi

if [ ! -z ${GG_BUILD_CUDA} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
fi

if [ ! -z ${GG_BUILD_SYCL} ]; then
    if [ -z ${ONEAPI_ROOT} ]; then
        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
        echo "source /opt/intel/oneapi/setvars.sh"
        exit 1
    fi

    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi

## helpers

# download a file if it does not exist or if it is outdated
function gg_wget {
    local out=$1
    local url=$2

    local cwd=`pwd`

    mkdir -p $out
    cd $out

    # should not re-download if file is the same
    wget -nv -N $url

    cd $cwd
}

function gg_printf {
    printf -- "$@" >> $OUT/README.md
}

function gg_run {
    ci=$1

    set -o pipefail
    set -x

    gg_run_$ci | tee $OUT/$ci.log
    cur=$?
    echo "$cur" > $OUT/$ci.exit

    set +x
    set +o pipefail

    gg_sum_$ci

    ret=$((ret | cur))
}

## ci
|
|
||||||
|
|
||||||
# ctest_debug
|
|
||||||
|
|
||||||
function gg_run_ctest_debug {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
# Check cmake, make and ctest are installed
|
|
||||||
gg_check_build_requirements
|
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
|
||||||
|
|
||||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_ctest_debug {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'Runs ctest in debug mode\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '\n'
|
|
||||||
}
|
|
||||||
|
|
||||||
# ctest_release
|
|
||||||
|
|
||||||
function gg_run_ctest_release {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
# Check cmake, make and ctest are installed
|
|
||||||
gg_check_build_requirements
|
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
|
||||||
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
|
||||||
else
|
|
||||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
|
||||||
fi
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_ctest_release {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'Runs ctest in release mode\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
}
|
|
||||||
|
|
||||||
# test_scripts_debug
|
|
||||||
|
|
||||||
function gg_run_test_scripts_debug {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_test_scripts_debug {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'Runs test scripts in debug mode\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '\n'
|
|
||||||
}
|
|
||||||
|
|
||||||
# test_scripts_release
|
|
||||||
|
|
||||||
function gg_run_test_scripts_release {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_test_scripts_release {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'Runs test scripts in release mode\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '\n'
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_get_model {
|
|
||||||
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
|
|
||||||
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
|
|
||||||
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
|
||||||
if [[ -s $gguf_0 ]]; then
|
|
||||||
echo -n "$gguf_0"
|
|
||||||
elif [[ -s $gguf_1 ]]; then
|
|
||||||
echo -n "$gguf_1"
|
|
||||||
elif [[ -s $gguf_2 ]]; then
|
|
||||||
echo -n "$gguf_2"
|
|
||||||
else
|
|
||||||
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_run_ctest_with_model_debug {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
local model; model=$(gg_get_model)
|
|
||||||
cd build-ci-debug
|
|
||||||
set -e
|
|
||||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
|
||||||
set +e
|
|
||||||
cd ..
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_run_ctest_with_model_release {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
local model; model=$(gg_get_model)
|
|
||||||
cd build-ci-release
|
|
||||||
set -e
|
|
||||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
|
||||||
set +e
|
|
||||||
cd ..
|
|
||||||
}

function gg_sum_ctest_with_model_debug {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest with model files in debug mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
}

function gg_sum_ctest_with_model_release {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest with model files in release mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
}

# open_llama_7b_v2
# requires: GG_BUILD_CUDA
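# Full pipeline on a CUDA build: download the HF checkpoint, convert it to an
# f16 GGUF with convert_legacy_llama.py, quantize it to a range of types
# (q8_0 .. q6_k), then run short text generation, wikitext-2 perplexity,
# imatrix and save-load-state checks, failing any quantization whose
# perplexity exceeds 20.0 (see check_ppl below).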
function gg_run_open_llama_7b_v2 {
    cd ${SRC}

    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/

    path_models="../models-mnt/open-llama/7B-v2"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test="${path_wiki}/wiki.test.raw"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

    (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
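
    # check_ppl treats the last floating-point number on the captured "[1]"
    # perplexity line as the ppl value and fails the quantization (return 20)
    # if it is above 20.0.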
    function check_ppl {
        qnt="$1"
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

    check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log

    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

    set +e
}

function gg_sum_open_llama_7b_v2 {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'OpenLLaMA 7B-v2:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}

# pythia_1.4b
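# Smaller variant used when GG_BUILD_CUDA is not set (see main below): Pythia
# 1.4B is converted with convert_hf_to_gguf.py, generations are limited to 64
# tokens and perplexity is measured only on the first 60 lines of
# wiki.test.raw (wiki.test-60.raw).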
function gg_run_pythia_1_4b {
    cd ${SRC}

    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw

    path_models="../models-mnt/pythia/1.4B"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test_60="${path_wiki}/wiki.test-60.raw"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

    (time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    (time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

    check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log

    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

    set +e
}

function gg_sum_pythia_1_4b {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Pythia 1.4B:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}

# pythia_2_8b
# requires: GG_BUILD_CUDA
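# CUDA variant of the Pythia job: the larger 2.8B checkpoint with full GPU
# offload (-ngl 999), 256-token generations and perplexity over 4 chunks of the
# full wiki.test.raw, mirroring the OpenLLaMA 7B-v2 pipeline above.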
function gg_run_pythia_2_8b {
    cd ${SRC}

    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/

    path_models="../models-mnt/pythia/2.8B"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test="${path_wiki}/wiki.test.raw"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

    (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

    check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log

    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

    set +e
}

function gg_sum_pythia_2_8b {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Pythia 2.8B:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}

# bge-small
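# Embedding smoke test: convert BAAI/bge-small-en-v1.5 (a BERT-style model) to
# GGUF, quantize it to q8_0 and run llama-embedding on a short prompt with both
# the f16 and q8_0 variants.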
function gg_run_embd_bge_small {
    cd ${SRC}

    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json

    gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json

    path_models="../models-mnt/bge-small"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

    (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

    set +e
}
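
# check_build_requirements
# Only prints a warning via gg_printf when cmake, make or ctest is missing on
# the runner; it does not abort the run.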
function gg_check_build_requirements {
    if ! command -v cmake &> /dev/null; then
        gg_printf 'cmake not found, please install'
    fi

    if ! command -v make &> /dev/null; then
        gg_printf 'make not found, please install'
    fi

    if ! command -v ctest &> /dev/null; then
        gg_printf 'ctest not found, please install'
    fi
}

function gg_sum_embd_bge_small {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'BGE Small (BERT):\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
}

## main
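
# Unless GG_BUILD_LOW_PERF is set, expose the shared model cache as
# ./models-mnt inside the source tree and prepare a throwaway python3 venv
# with the conversion requirements plus an editable gguf-py install.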
if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
    rm -rf ${SRC}/models-mnt
    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
    ln -sfn ${mnt_models} ${SRC}/models-mnt

    # Create a fresh python3 venv and enter it
    python3 -m venv "$MNT/venv"
    source "$MNT/venv/bin/activate"

    pip install -r ${SRC}/requirements.txt --disable-pip-version-check
    pip install --editable gguf-py --disable-pip-version-check
fi
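
# gg_run <name> (defined earlier in this script) invokes gg_run_<name> and
# records the stage's exit status (the gg_sum_* functions above read it from
# $OUT/${ci}.exit); it presumably also folds that status into $ret, so the
# `test $ret -eq 0 &&` guards skip the remaining stages after a failure.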
ret=0

test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release

if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
        test $ret -eq 0 && gg_run test_scripts_debug
        test $ret -eq 0 && gg_run test_scripts_release
    fi

    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
        if [ -z ${GG_BUILD_CUDA} ]; then
            test $ret -eq 0 && gg_run pythia_1_4b
        else
            test $ret -eq 0 && gg_run pythia_2_8b
            #test $ret -eq 0 && gg_run open_llama_7b_v2
        fi

        test $ret -eq 0 && gg_run ctest_with_model_debug
        test $ret -eq 0 && gg_run ctest_with_model_release
    fi
fi

exit $ret