Merge branch 'master' into openelm
commit 51b2577dd4
628 changed files with 205721 additions and 125563 deletions
@@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
         stage('Running llama.cpp'){
             sh'''#!/bin/bash
                 module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
-                qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+                qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
                 cat llama_log.txt # Printing results
             '''
         }
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all
 
 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
 
 COPY requirements.txt requirements.txt
 COPY requirements requirements
@@ -27,10 +27,10 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
-ENV LLAMA_CUDA=1
+ENV GGML_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
 
-RUN make
+RUN make -j$(nproc)
 
 ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -36,7 +36,7 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV LLAMA_HIPBLAS=1
+ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev
 
-RUN make
+RUN make -j$(nproc)
 
 ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build
 
 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
 
 COPY requirements.txt requirements.txt
 COPY requirements requirements
@@ -18,7 +18,7 @@ COPY . .
 ENV LLAMA_CURL=1
 
 
-RUN make
+RUN make -j$(nproc)
 
 ENV LC_ALL=C.utf8
 
@@ -21,12 +21,15 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
-ENV LLAMA_CUDA=1
+ENV GGML_CUDA=1
 
-RUN make
+RUN make -j$(nproc) llama-cli
 
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 
-COPY --from=build /app/main /main
+RUN apt-get update && \
+    apt-get install -y libgomp1
 
-ENTRYPOINT [ "/main" ]
+COPY --from=build /app/llama-cli /llama-cli
+
+ENTRYPOINT [ "/llama-cli" ]
.devops/llama-cli-intel.Dockerfile (new file, +26)
@@ -0,0 +1,26 @@
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+
+ARG GGML_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+    echo "GGML_SYCL_F16 is set" && \
+    export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+    fi && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target llama-cli
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-cli" ]
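As a usage note, this new SYCL image exposes GGML_SYCL_F16 as a build argument; below is a minimal build sketch. The image tag is an illustrative assumption, not part of the diff; the Dockerfile path and build argument come from the file above.

    # Build the Intel oneAPI CLI image with FP16 SYCL kernels enabled (tag is illustrative)
    docker build -t local/llama.cpp:sycl-cli \
        --build-arg GGML_SYCL_F16=ON \
        -f .devops/llama-cli-intel.Dockerfile .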
@@ -36,10 +36,10 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV LLAMA_HIPBLAS=1
+ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 
-RUN make
+RUN make -j$(nproc) llama-cli
 
-ENTRYPOINT [ "/app/main" ]
+ENTRYPOINT [ "/app/llama-cli" ]
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=jammy
 FROM ubuntu:$UBUNTU_VERSION as build
 
 # Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
+RUN apt update && apt install -y git build-essential cmake wget libgomp1
 
 # Install Vulkan SDK
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
@@ -14,14 +14,14 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DLLAMA_VULKAN=1 && \
-    cmake --build build --config Release --target main
+RUN cmake -B build -DGGML_VULKAN=1 && \
+    cmake --build build --config Release --target llama-cli
 
 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/main /main && \
+RUN cp /app/build/bin/llama-cli /llama-cli && \
     rm -rf /app
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]
@@ -9,12 +9,15 @@ WORKDIR /app
 
 COPY . .
 
-RUN make
+RUN make -j$(nproc) llama-cli
 
 FROM ubuntu:$UBUNTU_VERSION as runtime
 
-COPY --from=build /app/main /main
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/llama-cli /llama-cli
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]
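The rename changes what gets invoked inside (and outside) these images: the old main binary is now llama-cli, with the same inference flags. A minimal local sketch, where the model path and prompt are illustrative assumptions:

    # After a local build, the renamed CLI is invoked the same way main used to be
    ./llama-cli -m /models/model.gguf -p "Hello" -n 64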
@@ -1,84 +0,0 @@
-# SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
-# Built and maintained by John Boero - boeroboy@gmail.com
-# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-
-# Notes for llama.cpp:
-# 1. Tags are currently based on hash - which will not sort asciibetically.
-#    We need to declare standard versioning if people want to sort latest releases.
-# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
-# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
-#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
-# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
-#    It is up to the user to install the correct vendor-specific support.
-
-Name:           llama.cpp-clblast
-Version:        %( date "+%%Y%%m%%d" )
-Release:        1%{?dist}
-Summary:        OpenCL Inference of LLaMA model in C/C++
-License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
-BuildRequires:  coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
-Requires:       clblast
-URL:            https://github.com/ggerganov/llama.cpp
-
-%define debug_package %{nil}
-%define source_date_epoch_from_changelog 0
-
-%description
-CPU inference for Meta's Lllama2 models using default options.
-
-%prep
-%setup -n llama.cpp-master
-
-%build
-make -j LLAMA_CLBLAST=1
-
-%install
-mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamaclblast
-cp -p server %{buildroot}%{_bindir}/llamaclblastserver
-cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
-
-mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
-[Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
-After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
-
-[Service]
-Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
-ExecReload=/bin/kill -s HUP $MAINPID
-Restart=never
-
-[Install]
-WantedBy=default.target
-EOF
-
-mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
-EOF
-
-%clean
-rm -rf %{buildroot}
-rm -rf %{_builddir}/*
-
-%files
-%{_bindir}/llamaclblast
-%{_bindir}/llamaclblastserver
-%{_bindir}/llamaclblastsimple
-/usr/lib/systemd/system/llamaclblast.service
-%config /etc/sysconfig/llama
-
-%pre
-
-%post
-
-%preun
-%postun
-
-%changelog
@@ -32,13 +32,13 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master
 
 %build
-make -j LLAMA_CUDA=1
+make -j GGML_CUDA=1
 
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcuda
-cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
 
 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
 
@@ -67,9 +67,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 
 %files
-%{_bindir}/llamacppcuda
-%{_bindir}/llamacppcudaserver
-%{_bindir}/llamacppcudasimple
+%{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-server
+%{_bindir}/llama-cuda-simple
 /usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama
 
@@ -38,9 +38,9 @@ make -j
 
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llama
-cp -p server %{buildroot}%{_bindir}/llamaserver
-cp -p simple %{buildroot}%{_bindir}/llamasimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
 
 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
@@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
 
@@ -69,9 +69,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 
 %files
-%{_bindir}/llama
-%{_bindir}/llamaserver
-%{_bindir}/llamasimple
+%{_bindir}/llama-cli
+%{_bindir}/llama-server
+%{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service
 %config /etc/sysconfig/llama
 
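A brief usage sketch of the packaged service after the rename. The service name, EnvironmentFile path, and ExecStart binary come from the spec above; enabling the unit this way and the model path are assumptions about the intended workflow, not part of the diff.

    # Point the service at a model, then start the unit installed by the RPM
    echo 'LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"' | sudo tee /etc/sysconfig/llama
    sudo systemctl daemon-reload
    sudo systemctl enable --now llama.service
    # The unit now executes: /usr/bin/llama-server $LLAMA_ARGS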
@@ -21,17 +21,19 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
-ENV LLAMA_CUDA=1
+ENV GGML_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
 
-RUN make
+RUN make -j$(nproc) llama-server
 
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
-COPY --from=build /app/server /server
+COPY --from=build /app/llama-server /llama-server
 
-ENTRYPOINT [ "/server" ]
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
.devops/llama-server-intel.Dockerfile (new file, +31)
@@ -0,0 +1,31 @@
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+
+ARG GGML_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+    echo "GGML_SYCL_F16 is set" && \
+    export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+    fi && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target llama-server
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev curl
+
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+ENV LC_ALL=C.utf8
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
|
@ -36,15 +36,17 @@ COPY . .
|
||||||
# Set nvcc architecture
|
# Set nvcc architecture
|
||||||
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||||
# Enable ROCm
|
# Enable ROCm
|
||||||
ENV LLAMA_HIPBLAS=1
|
ENV GGML_HIPBLAS=1
|
||||||
ENV CC=/opt/rocm/llvm/bin/clang
|
ENV CC=/opt/rocm/llvm/bin/clang
|
||||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||||
|
|
||||||
# Enable cURL
|
# Enable cURL
|
||||||
ENV LLAMA_CURL=1
|
ENV LLAMA_CURL=1
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libcurl4-openssl-dev
|
apt-get install -y libcurl4-openssl-dev curl
|
||||||
|
|
||||||
RUN make
|
RUN make -j$(nproc) llama-server
|
||||||
|
|
||||||
ENTRYPOINT [ "/app/server" ]
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/app/llama-server" ]
|
|
@@ -5,27 +5,25 @@ FROM ubuntu:$UBUNTU_VERSION as build
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget
 
-# Install Vulkan SDK
+# Install Vulkan SDK and cURL
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
     wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
     apt update -y && \
-    apt-get install -y vulkan-sdk
-
-# Install cURL
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
 
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
-    cmake --build build --config Release --target server
+RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+    cmake --build build --config Release --target llama-server
 
 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/server /server && \
+RUN cp /app/build/bin/llama-server /llama-server && \
     rm -rf /app
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/server" ]
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
.devops/llama-server.Dockerfile (new file, +27)
@@ -0,0 +1,27 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git libcurl4-openssl-dev curl
+
+WORKDIR /app
+
+COPY . .
+
+ENV LLAMA_CURL=1
+
+RUN make -j$(nproc) llama-server
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1
+
+COPY --from=build /app/llama-server /llama-server
+
+ENV LC_ALL=C.utf8
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
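The server images now carry a HEALTHCHECK that probes /health over HTTP on port 8080. A minimal run sketch against the new CPU server image follows; the image tag, published port mapping, model path, and the --host/--port server flags are illustrative assumptions, while the Dockerfile path and health URL come from the file above.

    # Build and run the CPU server image, publishing the port the health check expects
    docker build -t local/llama.cpp:server -f .devops/llama-server.Dockerfile .
    docker run -d -p 8080:8080 -v /path/to/models:/models local/llama.cpp:server \
        -m /models/model.gguf --host 0.0.0.0 --port 8080
    # Same probe the HEALTHCHECK runs inside the container
    curl -f http://localhost:8080/health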
@@ -1,26 +0,0 @@
-ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
-
-ARG LLAMA_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
-    echo "LLAMA_SYCL_F16 is set" && \
-    export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
-    fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
-    cmake --build build --config Release --target main
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
-
-COPY --from=build /app/build/bin/main /main
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/main" ]
@@ -6,11 +6,11 @@
 let
   inherit (config.packages) default;
   binaries = [
-    "llama"
+    "llama-cli"
     "llama-embedding"
     "llama-server"
-    "quantize"
-    "train-text-from-scratch"
+    "llama-quantize"
+    "llama-train-text-from-scratch"
   ];
   mkApp = name: {
     type = "app";
@@ -160,9 +160,9 @@ effectiveStdenv.mkDerivation (
     };
 
     postPatch = ''
-      substituteInPlace ./ggml-metal.m \
+      substituteInPlace ./ggml/src/ggml-metal.m \
         --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-      substituteInPlace ./ggml-metal.m \
+      substituteInPlace ./ggml/src/ggml-metal.m \
        --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
     '';
 
@@ -205,18 +205,17 @@ effectiveStdenv.mkDerivation (
 
     cmakeFlags =
       [
-        (cmakeBool "LLAMA_NATIVE" false)
         (cmakeBool "LLAMA_BUILD_SERVER" true)
         (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
         (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-        (cmakeBool "LLAMA_BLAS" useBlas)
-        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-        (cmakeBool "LLAMA_CUDA" useCuda)
-        (cmakeBool "LLAMA_HIPBLAS" useRocm)
-        (cmakeBool "LLAMA_METAL" useMetalKit)
-        (cmakeBool "LLAMA_MPI" useMpi)
-        (cmakeBool "LLAMA_VULKAN" useVulkan)
-        (cmakeBool "LLAMA_STATIC" enableStatic)
+        (cmakeBool "GGML_NATIVE" false)
+        (cmakeBool "GGML_BLAS" useBlas)
+        (cmakeBool "GGML_CLBLAST" useOpenCL)
+        (cmakeBool "GGML_CUDA" useCuda)
+        (cmakeBool "GGML_HIPBLAS" useRocm)
+        (cmakeBool "GGML_METAL" useMetalKit)
+        (cmakeBool "GGML_VULKAN" useVulkan)
+        (cmakeBool "GGML_STATIC" enableStatic)
       ]
       ++ optionals useCuda [
         (
@@ -232,7 +231,7 @@ effectiveStdenv.mkDerivation (
       ]
       ++ optionals useMetalKit [
         (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
-        (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+        (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
       ];
 
     # Environment variables needed for ROCm
@@ -244,10 +243,8 @@ effectiveStdenv.mkDerivation (
     # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
     # if they haven't been added yet.
     postInstall = ''
-      mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
-      mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
       mkdir -p $out/include
-      cp $src/llama.h $out/include/
+      cp $src/include/llama.h $out/include/
     '';
 
     # Define the shells here, but don't add in the inputsFrom to avoid recursion.
@@ -295,7 +292,7 @@ effectiveStdenv.mkDerivation (
       license = lib.licenses.mit;
 
       # Accommodates `nix run` and `lib.getExe`
-      mainProgram = "llama";
+      mainProgram = "llama-cli";
 
       # These people might respond, on the best effort basis, if you ping them
       # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
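For contributors building outside Nix, the renamed flags map directly onto a plain CMake configure. A hedged sketch using a subset of the options touched above; the particular combination (CUDA on, shared libs off) is illustrative, not a required configuration.

    # GGML_* now selects the backend/feature options formerly prefixed LLAMA_*
    cmake -B build \
        -DGGML_NATIVE=OFF \
        -DGGML_CUDA=ON \
        -DLLAMA_BUILD_SERVER=ON \
        -DBUILD_SHARED_LIBS=OFF
    cmake --build build --config Release -j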
@@ -1,29 +0,0 @@
-ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
-
-ARG LLAMA_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
-    echo "LLAMA_SYCL_F16 is set" && \
-    export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
-    fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
-    cmake --build build --config Release --target server
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
-
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-
-COPY --from=build /app/build/bin/server /server
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/server" ]
@@ -1,25 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION as build
-
-RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-ENV LLAMA_CURL=1
-
-RUN make
-
-FROM ubuntu:$UBUNTU_VERSION as runtime
-
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-
-COPY --from=build /app/server /server
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/server" ]
@@ -8,13 +8,13 @@ arg1="$1"
 shift
 
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert.py "$@"
+    python3 ./convert-hf-to-gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./quantize "$@"
+    ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./main "$@"
+    ./llama-cli "$@"
 elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-    ./finetune "$@"
+    ./llama-finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -22,11 +22,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./quantize "$i" "${i/f16/q4_0}" q4_0
+            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./server "$@"
+    ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
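The tools.sh dispatcher keeps its flag interface while forwarding to the renamed binaries, so existing container invocations keep working. A usage sketch under assumed model paths and server port; the flags and the forwarded commands are from the script above.

    # Each flag now reaches the renamed executable (shown in the trailing comments)
    .devops/tools.sh --quantize /models/model-f16.gguf /models/model-q4_0.gguf q4_0   # ./llama-quantize ...
    .devops/tools.sh --run -m /models/model-q4_0.gguf -p "Hello" -n 64                # ./llama-cli ...
    .devops/tools.sh --server -m /models/model-q4_0.gguf --port 8080                  # ./llama-server ...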
@@ -12,8 +12,8 @@ build*/
 
 models/*
 
-/main
-/quantize
+/llama-cli
+/llama-quantize
 
 arm_neon.h
 compile_commands.json
@@ -26,3 +26,7 @@ indent_size = 2
 
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
+
+[examples/cvector-generator/*.txt]
+trim_trailing_whitespace = unset
+insert_final_newline = unset
.github/ISSUE_TEMPLATE/01-bug-low.yml (new file, vendored, +50)
@@ -0,0 +1,50 @@
+name: Low Severity Bugs
+description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
+title: "Bug: "
+labels: ["bug-unconfirmed", "low severity"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to fill out this bug report!
+        Please include information about your system, the steps to reproduce the bug,
+        and the version of llama.cpp that you are using.
+        If possible, please provide a minimal code example that reproduces the bug.
+  - type: textarea
+    id: what-happened
+    attributes:
+      label: What happened?
+      description: Also tell us, what did you expect to happen?
+      placeholder: Tell us what you see!
+    validations:
+      required: true
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: What operating system are you seeing the problem on?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+      render: shell
.github/ISSUE_TEMPLATE/02-bug-medium.yml (new file, vendored, +50)
@@ -0,0 +1,50 @@
+name: Medium Severity Bug
+description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
+title: "Bug: "
+labels: ["bug-unconfirmed", "medium severity"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to fill out this bug report!
+        Please include information about your system, the steps to reproduce the bug,
+        and the version of llama.cpp that you are using.
+        If possible, please provide a minimal code example that reproduces the bug.
+  - type: textarea
+    id: what-happened
+    attributes:
+      label: What happened?
+      description: Also tell us, what did you expect to happen?
+      placeholder: Tell us what you see!
+    validations:
+      required: true
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: What operating system are you seeing the problem on?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+      render: shell
.github/ISSUE_TEMPLATE/03-bug-high.yml (new file, vendored, +50)
@@ -0,0 +1,50 @@
+name: High Severity Bug
+description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
+title: "Bug: "
+labels: ["bug-unconfirmed", "high severity"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to fill out this bug report!
+        Please include information about your system, the steps to reproduce the bug,
+        and the version of llama.cpp that you are using.
+        If possible, please provide a minimal code example that reproduces the bug.
+  - type: textarea
+    id: what-happened
+    attributes:
+      label: What happened?
+      description: Also tell us, what did you expect to happen?
+      placeholder: Tell us what you see!
+    validations:
+      required: true
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: What operating system are you seeing the problem on?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+      render: shell
.github/ISSUE_TEMPLATE/04-bug-critical.yml (new file, vendored, +50)
@@ -0,0 +1,50 @@
+name: Critical Severity Bug
+description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
+title: "Bug: "
+labels: ["bug-unconfirmed", "critical severity"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to fill out this bug report!
+        Please include information about your system, the steps to reproduce the bug,
+        and the version of llama.cpp that you are using.
+        If possible, please provide a minimal code example that reproduces the bug.
+  - type: textarea
+    id: what-happened
+    attributes:
+      label: What happened?
+      description: Also tell us, what did you expect to happen?
+      placeholder: Tell us what you see!
+    validations:
+      required: true
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: What operating system are you seeing the problem on?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+      render: shell
.github/ISSUE_TEMPLATE/05-enhancement.yml (new file, vendored, +51)
@@ -0,0 +1,51 @@
+name: Enhancement
+description: Used to request enhancements for llama.cpp
+title: "Feature Request: "
+labels: ["enhancement"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
+
+  - type: checkboxes
+    id: prerequisites
+    attributes:
+      label: Prerequisites
+      description: Please confirm the following before submitting your enhancement request.
+      options:
+        - label: I am running the latest code. Mention the version if possible as well.
+          required: true
+        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+          required: true
+        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
+          required: true
+        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
+          required: true
+
+  - type: textarea
+    id: feature-description
+    attributes:
+      label: Feature Description
+      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+      placeholder: Detailed description of the enhancement
+    validations:
+      required: true
+
+  - type: textarea
+    id: motivation
+    attributes:
+      label: Motivation
+      description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+      placeholder: Explanation of why this feature is needed and its benefits
+    validations:
+      required: true
+
+  - type: textarea
+    id: possible-implementation
+    attributes:
+      label: Possible Implementation
+      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
+      placeholder: Detailed description of potential implementation
+    validations:
+      required: false
.github/ISSUE_TEMPLATE/06-research.yml (new file, vendored, +52)
@@ -0,0 +1,52 @@
+name: Research
+description: Track new technical research area
+title: "Research: "
+labels: ["research 🔬"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+
+  - type: checkboxes
+    id: research-stage
+    attributes:
+      label: Research Stage
+      description: Track general state of this research ticket
+      options:
+        - label: Background Research (Let's try to avoid reinventing the wheel)
+        - label: Hypothesis Formed (How do you think this will work and it's effect?)
+        - label: Strategy / Implementation Forming
+        - label: Analysis of results
+        - label: Debrief / Documentation (So people in the future can learn from us)
+
+  - type: textarea
+    id: background
+    attributes:
+      label: Previous existing literature and research
+      description: Whats the current state of the art and whats the motivation for this research?
+
+  - type: textarea
+    id: hypothesis
+    attributes:
+      label: Hypothesis
+      description: How do you think this will work and it's effect?
+
+  - type: textarea
+    id: implementation
+    attributes:
+      label: Implementation
+      description: Got an approach? e.g. a PR ready to go?
+
+  - type: textarea
+    id: analysis
+    attributes:
+      label: Analysis
+      description: How does the proposed implementation behave?
+
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+      render: shell
.github/ISSUE_TEMPLATE/07-refactor.yml (new file, vendored, +28)
@@ -0,0 +1,28 @@
+name: Refactor (Maintainers)
+description: Used to track refactoring opportunities
+title: "Refactor: "
+labels: ["refactor"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
+
+  - type: textarea
+    id: background-description
+    attributes:
+      label: Background Description
+      description: Please provide a detailed written description of the pain points you are trying to solve.
+      placeholder: Detailed description behind your motivation to request refactor
+    validations:
+      required: true
+
+  - type: textarea
+    id: possible-approaches
+    attributes:
+      label: Possible Refactor Approaches
+      description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
+      placeholder: Your idea of possible refactoring opportunity/approaches
+    validations:
+      required: false
.github/ISSUE_TEMPLATE/bug.md (deleted, vendored, -11)
@@ -1,11 +0,0 @@
----
-name: Bug template
-about: Used to report bugs in llama.cpp
-labels: ["bug-unconfirmed"]
-assignees: ''
-
----
-
-Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
-
-If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
.github/ISSUE_TEMPLATE/config.yml (new file, vendored, +13)
@@ -0,0 +1,13 @@
+blank_issues_enabled: true
+contact_links:
+  - name: Got an idea?
+    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
+    about: Pop it there. It may then become an enhancement ticket.
+  - name: Got a question?
+    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
+    about: Ask a question there!
+  - name: Want to contribute?
+    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
+    about: Head to the contribution guide page of the wiki for areas you can help with
+
+
.github/ISSUE_TEMPLATE/enhancement.md (deleted, vendored, -28)
@@ -1,28 +0,0 @@
----
-name: Enhancement template
-about: Used to request enhancements for llama.cpp
-labels: ["enhancement"]
-assignees: ''
-
----
-
-# Prerequisites
-
-Please answer the following questions for yourself before submitting an issue.
-
-- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
-- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
-- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
-- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
-
-# Feature Description
-
-Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
-
-# Motivation
-
-Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
-
-# Possible Implementation
-
-If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
.github/labeler.yml (vendored)
@@ -1,20 +1,32 @@
 # https://github.com/actions/labeler
+Kompute:
+  - changed-files:
+    - any-glob-to-any-file:
+      - ggml/include/ggml-kompute.h
+      - ggml/src/ggml-kompute.cpp
+      - README-kompute.md
+Apple Metal:
+  - changed-files:
+    - any-glob-to-any-file:
+      - ggml/include/ggml-metal.h
+      - ggml/src/ggml-metal.cpp
+      - README-metal.md
 SYCL:
   - changed-files:
     - any-glob-to-any-file:
-      - ggml-sycl.h
-      - ggml-sycl.cpp
+      - ggml/include/ggml-sycl.h
+      - ggml/src/ggml-sycl.cpp
       - README-sycl.md
 Nvidia GPU:
   - changed-files:
     - any-glob-to-any-file:
-      - ggml-cuda/**
+      - ggml/include/ggml-cuda.h
+      - ggml/src/ggml-cuda/**
 Vulkan:
   - changed-files:
     - any-glob-to-any-file:
-      - ggml_vk_generate_shaders.py
-      - ggml-vulkan*
+      - ggml/ggml_vk_generate_shaders.py
+      - ggml/src/ggml-vulkan*
 documentation:
   - changed-files:
     - any-glob-to-any-file:
@@ -30,7 +42,6 @@ build:
     - cmake/**
     - CMakeLists.txt
     - CMakePresets.json
-    - codecov.yml
 examples:
   - changed-files:
     - any-glob-to-any-file: examples/**
@@ -62,8 +73,10 @@ server:
 ggml:
   - changed-files:
     - any-glob-to-any-file:
-      - ggml-*.c
-      - ggml-*.h
+      - ggml/include/ggml*.h
+      - ggml/src/ggml*.c
+      - ggml/src/ggml*.cpp
+      - ggml/src/ggml*.h
       - ggml-cuda/**
 nix:
   - changed-files:
@@ -71,3 +84,6 @@ nix:
     - "**/*.nix"
     - .github/workflows/nix-*.yml
     - .devops/nix/nixpkgs-instances.nix
+embedding:
+  - changed-files:
+    - any-glob-to-any-file: examples/embedding/
.github/pull_request_template.md (new file, vendored, +7)
@@ -0,0 +1,7 @@
+
+
+- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+- Self-reported review complexity:
+  - [ ] Low
+  - [ ] Medium
+  - [ ] High
.github/workflows/bench.yml (vendored)
@@ -109,7 +109,7 @@ jobs:
        run: |
          set -eux
          cmake -B build \
-             -DLLAMA_NATIVE=OFF \
+             -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
@@ -119,7 +119,7 @@ jobs:
              -DLLAMA_FATAL_WARNINGS=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DCMAKE_BUILD_TYPE=Release;
-         cmake --build build --config Release -j $(nproc) --target server
+         cmake --build build --config Release -j $(nproc) --target llama-server
 
      - name: Download the dataset
        id: download_dataset
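The same target rename applies when reproducing this CI step locally. A trimmed sketch of the configure/build pair; the flag subset shown is an assumption (the workflow passes additional options such as CUDA and warning settings).

    cmake -B build -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release
    cmake --build build --config Release -j $(nproc) --target llama-server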

.github/workflows/build.yml (vendored, 190 lines changed)

@@ -10,10 +10,10 @@ on:
 push:
 branches:
 - master
-paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
+paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
 pull_request:
 types: [opened, synchronize, reopened]
-paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
+paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']

 concurrency:
 group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

@@ -47,7 +47,7 @@ jobs:
 sysctl -a
 mkdir build
 cd build
-cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
 cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test

@@ -84,7 +84,7 @@ jobs:
 name: llama-bin-macos-arm64.zip

 macOS-latest-cmake-x64:
-runs-on: macos-latest
+runs-on: macos-12

 steps:
 - name: Clone

@@ -103,12 +103,10 @@ jobs:
 id: cmake_build
 run: |
 sysctl -a
-mkdir build
-cd build
 # Metal is disabled due to intermittent failures with Github runners not having a GPU:
 # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
+cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
-cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test
 id: cmake_test

@@ -224,7 +222,7 @@ jobs:
 run: |
 mkdir build
 cd build
-cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
+cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
 cmake --build . --config Release -j $(nproc)

 - name: Test

@@ -241,8 +239,8 @@ jobs:
 wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
 echo "Fetch llama2c model"
 wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

 - name: Determine tag name
 id: tag

@@ -271,49 +269,15 @@ jobs:
 path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
 name: llama-bin-ubuntu-x64.zip

-# ubuntu-latest-cmake-sanitizer:
+ubuntu-latest-cmake-sanitizer:
-# runs-on: ubuntu-latest
-#
-# continue-on-error: true
-#
-# strategy:
-# matrix:
-# sanitizer: [ADDRESS, THREAD, UNDEFINED]
-# build_type: [Debug, Release]
-#
-# steps:
-# - name: Clone
-# id: checkout
-# uses: actions/checkout@v4
-#
-# - name: Dependencies
-# id: depends
-# run: |
-# sudo apt-get update
-# sudo apt-get install build-essential
-#
-# - name: Build
-# id: cmake_build
-# run: |
-# mkdir build
-# cd build
-# cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-# cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-#
-# - name: Test
-# id: cmake_test
-# run: |
-# cd build
-# ctest -L main --verbose --timeout 900

-ubuntu-latest-cmake-mpi:
 runs-on: ubuntu-latest

 continue-on-error: true

 strategy:
 matrix:
-mpi_library: [mpich, libopenmpi-dev]
+sanitizer: [ADDRESS, THREAD, UNDEFINED]
+build_type: [Debug, Release]

 steps:
 - name: Clone

@@ -324,21 +288,31 @@ jobs:
 id: depends
 run: |
 sudo apt-get update
-sudo apt-get install build-essential ${{ matrix.mpi_library }}
+sudo apt-get install build-essential

 - name: Build
 id: cmake_build
+if: ${{ matrix.sanitizer != 'THREAD' }}
 run: |
 mkdir build
 cd build
-cmake -DLLAMA_MPI=ON ..
+cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-cmake --build . --config Release -j $(nproc)
+cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

+- name: Build (no OpenMP)
+id: cmake_build_no_openmp
+if: ${{ matrix.sanitizer == 'THREAD' }}
+run: |
+mkdir build
+cd build
+cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
+cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

 - name: Test
 id: cmake_test
 run: |
 cd build
-ctest -L main --verbose
+ctest -L main --verbose --timeout 900

 ubuntu-latest-cmake-rpc:
 runs-on: ubuntu-latest

@@ -361,7 +335,7 @@ jobs:
 run: |
 mkdir build
 cd build
-cmake -DLLAMA_RPC=ON ..
+cmake -DGGML_RPC=ON ..
 cmake --build . --config Release -j $(nproc)

 - name: Test

@@ -389,7 +363,7 @@ jobs:
 run: |
 mkdir build
 cd build
-cmake -DLLAMA_VULKAN=ON ..
+cmake -DGGML_VULKAN=ON ..
 cmake --build . --config Release -j $(nproc)

 ubuntu-22-cmake-hip:

@@ -410,13 +384,13 @@ jobs:
 - name: Build with native CMake HIP support
 id: cmake_build
 run: |
-cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
+cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
 cmake --build build --config Release -j $(nproc)

 - name: Build with legacy HIP support
 id: cmake_build_legacy_hip
 run: |
-cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
+cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
 cmake --build build2 --config Release -j $(nproc)

 ubuntu-22-cmake-sycl:

@@ -457,7 +431,7 @@ jobs:
 source /opt/intel/oneapi/setvars.sh
 mkdir build
 cd build
-cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
 cmake --build . --config Release -j $(nproc)

 ubuntu-22-cmake-sycl-fp16:

@@ -498,10 +472,10 @@ jobs:
 source /opt/intel/oneapi/setvars.sh
 mkdir build
 cd build
-cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
+cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
 cmake --build . --config Release -j $(nproc)

-# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+# TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
 # how to debug it.
 # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
 macOS-latest-make:

@@ -523,15 +497,15 @@ jobs:
 env:
 LLAMA_FATAL_WARNINGS: 1
 run: |
-LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
+GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)

 - name: Test
 id: make_test
 run: |
-LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
+GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
-LLAMA_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
+GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)

-# TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
 # how to debug it.
 # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
 # would be great if we fix these

@@ -555,7 +529,7 @@ jobs:
 sysctl -a
 mkdir build
 cd build
-cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
+cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
 cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test

@@ -585,13 +559,14 @@ jobs:
 mkdir build
 cd build
 cmake -G Xcode .. \
--DLLAMA_METAL_EMBED_LIBRARY=ON \
+-DGGML_METAL_EMBED_LIBRARY=ON \
 -DLLAMA_BUILD_EXAMPLES=OFF \
 -DLLAMA_BUILD_TESTS=OFF \
 -DLLAMA_BUILD_SERVER=OFF \
 -DCMAKE_SYSTEM_NAME=iOS \
--DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
+-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

 macOS-latest-cmake-tvos:
 runs-on: macos-latest

@@ -614,13 +589,14 @@ jobs:
 mkdir build
 cd build
 cmake -G Xcode .. \
--DLLAMA_METAL_EMBED_LIBRARY=ON \
+-DGGML_METAL_EMBED_LIBRARY=ON \
 -DLLAMA_BUILD_EXAMPLES=OFF \
 -DLLAMA_BUILD_TESTS=OFF \
 -DLLAMA_BUILD_SERVER=OFF \
 -DCMAKE_SYSTEM_NAME=tvOS \
--DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
+-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

 macOS-latest-swift:
 runs-on: macos-latest

@@ -688,7 +664,7 @@ jobs:
 - name: Build using make w/ OpenBLAS
 shell: msys2 {0}
 run: |
-make LLAMA_OPENBLAS=1 -j $(nproc)
+make GGML_OPENBLAS=1 -j $(nproc)

 - name: Build using CMake
 shell: msys2 {0}

@@ -704,16 +680,14 @@ jobs:
 - name: Build using CMake w/ OpenBLAS
 shell: msys2 {0}
 run: |
-cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 cmake --build build --config ${{ matrix.build }} -j $(nproc)

 windows-latest-cmake:
-runs-on: windows-latest
+runs-on: windows-2019

 env:
 OPENBLAS_VERSION: 0.3.23
-OPENCL_VERSION: 2023.04.17
-CLBLAST_VERSION: 1.6.0
 SDE_VERSION: 9.33.0-2024-01-07
 VULKAN_VERSION: 1.3.261.1

@@ -721,27 +695,25 @@ jobs:
 matrix:
 include:
 - build: 'rpc-x64'
-defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
 - build: 'noavx-x64'
-defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
 - build: 'avx2-x64'
-defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
 - build: 'avx-x64'
-defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
 - build: 'avx512-x64'
-defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
-- build: 'clblast-x64'
-defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
 - build: 'openblas-x64'
-defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
 - build: 'kompute-x64'
-defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
 - build: 'vulkan-x64'
-defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
 - build: 'llvm-arm64'
-defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
 - build: 'msvc-arm64'
-defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'

 steps:
 - name: Clone

@@ -754,28 +726,7 @@ jobs:
 id: clone_kompute
 if: ${{ matrix.build == 'kompute-x64' }}
 run: |
-git submodule update --init kompute
+git submodule update --init ggml/src/kompute

-- name: Download OpenCL SDK
-id: get_opencl
-if: ${{ matrix.build == 'clblast-x64' }}
-run: |
-curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
-mkdir $env:RUNNER_TEMP/opencl
-tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
-
-- name: Download CLBlast
-id: get_clblast
-if: ${{ matrix.build == 'clblast-x64' }}
-run: |
-curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
-curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
-7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
-rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
-foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
-$txt = Get-Content -Path $f -Raw
-$txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
-}

 - name: Download OpenBLAS
 id: get_openblas

@@ -810,13 +761,6 @@ jobs:
 cmake -S . -B build ${{ matrix.defines }}
 cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}

-- name: Add clblast.dll
-id: add_clblast_dll
-if: ${{ matrix.build == 'clblast-x64' }}
-run: |
-cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
-cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt

 - name: Add libopenblas.dll
 id: add_libopenblas_dll
 if: ${{ matrix.build == 'openblas-x64' }}

@@ -840,7 +784,7 @@ jobs:
 - name: Test
 id: cmake_test
 # not all machines have native AVX-512
-if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
 run: |
 cd build
 ctest -L main -C Release --verbose --timeout 900

@@ -855,6 +799,7 @@ jobs:
 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
 $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
 cd build
+$env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
 & $sde -future -- ctest -L main -C Release --verbose --timeout 900

 - name: Determine tag name

@@ -885,7 +830,7 @@ jobs:
 name: llama-bin-win-${{ matrix.build }}.zip

 windows-latest-cmake-cuda:
-runs-on: windows-latest
+runs-on: windows-2019

 strategy:
 matrix:

@@ -899,8 +844,9 @@ jobs:
 with:
 fetch-depth: 0

-- uses: Jimver/cuda-toolkit@v0.2.11
+- name: Install CUDA toolkit
 id: cuda-toolkit
+uses: Jimver/cuda-toolkit@v0.2.15
 with:
 cuda: ${{ matrix.cuda }}
 method: 'network'

@@ -911,7 +857,7 @@ jobs:
 run: |
 mkdir build
 cd build
-cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
+cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
 cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

 - name: Determine tag name

@@ -1044,7 +990,7 @@ jobs:
 run: |
 $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
 $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
+cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON
 cmake --build build --config Release

 ios-xcode-build:

@@ -1095,7 +1041,7 @@ jobs:
 # hypervisor: 'qemu'
 # run: |
 # sudo pkg update
-# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
+# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
 # gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`

 release:
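Note, not part of the diff: the CMake invocations in the hunks above can be reproduced outside CI. A minimal local sketch, assuming a checkout of this repository and the GGML_* options exactly as they appear in the hunks above:

  # configure and build the RPC-enabled variant, mirroring the ubuntu-latest-cmake-rpc job above
  cmake -B build -DGGML_RPC=ON
  cmake --build build --config Release -j $(nproc)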

.github/workflows/code-coverage.yml (vendored, 40 lines deleted)

@@ -1,40 +0,0 @@
-name: Code Coverage
-on: [push, pull_request]
-
-env:
-GGML_NLOOP: 3
-GGML_N_THREADS: 1
-
-concurrency:
-group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-cancel-in-progress: true
-
-jobs:
-run:
-runs-on: ubuntu-20.04
-steps:
-- name: Checkout
-uses: actions/checkout@v4
-
-- name: Dependencies
-run: |
-sudo apt-get update
-sudo apt-get install build-essential gcc-8 lcov
-
-- name: Build
-run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
-
-- name: Run tests
-run: CC=gcc-8 make test
-
-- name: Generate coverage report
-run: |
-make coverage
-make lcov-report
-
-- name: Upload coverage to Codecov
-uses: codecov/codecov-action@v3
-env:
-CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-with:
-files: lcov-report/coverage.info

.github/workflows/docker.yml (vendored, 27 lines changed)

@@ -10,10 +10,11 @@
 name: Publish Docker image

 on:
-pull_request:
+#pull_request:
 push:
 branches:
 - master
+paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']

 concurrency:
 group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

@@ -22,7 +23,7 @@ concurrency:
 jobs:
 push_to_registry:
 name: Push Docker image to Docker Hub
-if: github.event.pull_request.draft == false
+#if: github.event.pull_request.draft == false

 runs-on: ubuntu-latest
 env:

@@ -30,20 +31,18 @@ jobs:
 strategy:
 matrix:
 config:
-- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+- { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
 - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
-# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
+- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
-# have disabled them for now until the reason why
-# is understood.
-- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
 - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
+- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+# Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
-- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+#- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
+- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
-- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
+- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
 steps:
 - name: Check out the repo
 uses: actions/checkout@v4

.github/workflows/labeler.yml (vendored, 5 lines changed)

@@ -9,4 +9,9 @@ jobs:
 pull-requests: write
 runs-on: ubuntu-latest
 steps:
+- uses: actions/checkout@v4
+with:
+repository: "ggerganov/llama.cpp"
 - uses: actions/labeler@v5
+with:
+configuration-path: '.github/labeler.yml'

.github/workflows/server.yml (vendored, 38 lines changed)

@@ -16,11 +16,9 @@ on:
 branches:
 - master
 paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
-pull_request_target:
+pull_request:
 types: [opened, synchronize, reopened]
 paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
-schedule:
-- cron: '2 4 * * *'

 concurrency:
 group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}

@@ -32,10 +30,8 @@ jobs:

 strategy:
 matrix:
-# TODO: temporary disabled due to linux kernel issues
+sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
-#sanitizer: [ADDRESS, THREAD, UNDEFINED]
+build_type: [RelWithDebInfo]
-sanitizer: [UNDEFINED]
-build_type: [Debug]
 include:
 - build_type: Release
 sanitizer: ""

@@ -91,21 +87,33 @@ jobs:
 exit 1
 fi

-- name: Build
+- name: Build (no OpenMP)
-id: cmake_build
+id: cmake_build_no_openmp
+if: ${{ matrix.sanitizer == 'THREAD' }}
 run: |
 cmake -B build \
--DLLAMA_NATIVE=OFF \
+-DGGML_NATIVE=OFF \
+-DLLAMA_BUILD_SERVER=ON \
+-DLLAMA_CURL=ON \
+-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+-DGGML_OPENMP=OFF ;
+cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

+- name: Build
+id: cmake_build
+if: ${{ matrix.sanitizer != 'THREAD' }}
+run: |
+cmake -B build \
+-DGGML_NATIVE=OFF \
 -DLLAMA_BUILD_SERVER=ON \
 -DLLAMA_CURL=ON \
 -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
 -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
+cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

 - name: Tests
 id: server_integration_tests
-if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
 run: |
 cd examples/server/tests
 PORT=8888 ./tests.sh

@@ -119,7 +127,7 @@ jobs:

 server-windows:
-runs-on: windows-latest
+runs-on: windows-2019

 steps:
 - name: Clone

@@ -142,7 +150,7 @@ jobs:
 id: cmake_build
 run: |
 cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
-cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
+cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

 - name: Python setup
 id: setup_python
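A rough local equivalent of the server job above; a sketch only, assuming the llama-server target and the examples/server/tests suite referenced in the hunks:

  # build the server target and run the integration tests, as the workflow above does
  cmake -B build -DLLAMA_BUILD_SERVER=ON -DLLAMA_CURL=ON
  cmake --build build --config Release -j $(nproc) --target llama-server
  cd examples/server/tests
  PORT=8888 ./tests.sh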

.github/workflows/zig-build.yml (vendored, 29 lines deleted)

@@ -1,29 +0,0 @@
-name: Zig CI
-
-on:
-pull_request:
-push:
-branches:
-- master
-
-concurrency:
-group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-cancel-in-progress: true
-
-jobs:
-build:
-strategy:
-fail-fast: false
-matrix:
-runs-on: [ubuntu-latest, macos-latest, windows-latest]
-runs-on: ${{ matrix.runs-on }}
-steps:
-- uses: actions/checkout@v4
-with:
-submodules: recursive
-fetch-depth: 0
-- uses: goto-bus-stop/setup-zig@v2
-with:
-version: 0.11.0
-- name: Build Summary
-run: zig build --summary all -freference-trace

.gitignore (vendored, 152 lines changed)

@@ -1,126 +1,124 @@
-*.o
+# Extensions

 *.a
-*.so
+*.bat
+*.bin
+*.dll
+*.dot
+*.etag
+*.exe
+*.gcda
+*.gcno
+*.gcov
 *.gguf
 *.gguf.json
-*.bin
-*.exe
-*.dll
-*.log
-*.gcov
-*.gcno
-*.gcda
-*.dot
-*.bat
-*.tmp
-*.metallib
-*.etag
 *.lastModified
-.DS_Store
+*.log
-.build/
+*.metallib
+*.o
+*.so
+*.tmp

+# IDE / OS

 .cache/
 .ccls-cache/
 .direnv/
+.DS_Store
 .envrc
+.idea/
 .swiftpm
-.venv
-.clang-tidy
 .vs/
 .vscode/
-.idea/
+nppBackup

-ggml-metal-embed.metal

-lcov-report/
+# Coverage

 gcovr-report/
+lcov-report/

+# Build Artifacts

+tags
+.build/
 build*
+!build-info.cmake
+!build-info.cpp.in
+!build-info.sh
 !build.zig
+/libllama.so
+/llama-*
+android-ndk-*
+arm_neon.h
 cmake-build-*
+CMakeSettings.json
+compile_commands.json
+ggml-metal-embed.metal
+llama-batched-swift
+/rpc-server
 out/
 tmp/

+# CI

+!.github/workflows/*.yml

+# Models

 models/*
 models-mnt
+!models/.editorconfig
+!models/ggml-vocab-*.gguf*

-/Pipfile
+# Zig
-/baby-llama
-/beam-search
-/benchmark-matmult
-/convert-llama2c-to-ggml
-/embd-input-test
-/embedding
-/eval-callback
-/gguf
-/gguf-llama-simple
-/gguf-split
-/gritlm
-/imatrix
-/infill
-/libllama.so
-/llama-bench
-/llava-cli
-/lookahead
-/lookup
-/lookup-create
-/lookup-merge
-/lookup-stats
-/main
-/metal
-/passkey
-/perplexity
-/q8dot
-/quantize
-/quantize-stats
-/result
-/save-load-state
-/server
-/simple
-/batched
-/batched-bench
-/export-lora
-/finetune
-/retrieval
-/speculative
-/parallel
-/train-text-from-scratch
-/tokenize
-/vdot
-/common/build-info.cpp
-arm_neon.h
-compile_commands.json
-CMakeSettings.json

-__pycache__
-dist

 zig-out/
 zig-cache/

+# Logs

 ppl-*.txt
 qnt-*.txt
 perf-*.txt

+# Examples

 examples/jeopardy/results.txt
+examples/server/*.css.hpp
 examples/server/*.html.hpp
 examples/server/*.js.hpp
 examples/server/*.mjs.hpp
+!build_64.sh
+!examples/*.bat
+!examples/*/*.kts
+!examples/*/*/*.kts
+!examples/sycl/*.bat
+!examples/sycl/*.sh

+# Python

+__pycache__
+.venv
+/Pipfile
+dist
 poetry.lock
 poetry.toml
-nppBackup

 # Test binaries
-/tests/test-grammar-parser
+/tests/test-backend-ops
-/tests/test-llama-grammar
 /tests/test-double-float
 /tests/test-grad0
+/tests/test-grammar-parser
+/tests/test-llama-grammar
 /tests/test-opt
 /tests/test-quantize-fns
 /tests/test-quantize-perf
+/tests/test-rope
 /tests/test-sampling
 /tests/test-tokenizer-0
-/tests/test-tokenizer-1-spm
 /tests/test-tokenizer-1-bpe
-/tests/test-rope
+/tests/test-tokenizer-1-spm
-/tests/test-backend-ops
+# Scripts
+!/scripts/install-oneapi.bat

.gitmodules (vendored, 2 lines changed)

@@ -1,3 +1,3 @@
 [submodule "kompute"]
-path = kompute
+path = ggml/src/kompute
 url = https://github.com/nomic-ai/kompute.git

AUTHORS (129 lines changed)

@@ -1,8 +1,9 @@
-# date: Tue Apr 9 09:17:14 EEST 2024
+# date: Wed Jun 26 19:36:34 EEST 2024
 # this file is auto-generated by scripts/gen-authors.sh

 0cc4m <picard12@live.de>
 0xspringtime <110655352+0xspringtime@users.noreply.github.com>
+20kdc <asdd2808@gmail.com>
 2f38b454 <dxf@protonmail.com>
 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
 44670 <44670@users.noreply.github.com>
@@ -11,14 +12,18 @@ AT <manyoso@users.noreply.github.com>
 Aarni Koskela <akx@iki.fi>
 Aaron Miller <apage43@ninjawhale.com>
 Aaryaman Vasishta <aaryaman.vasishta@amd.com>
+Abheek Gulati <abheekg@hotmail.com>
 Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
 Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
 Adithya Balaji <adithya.b94@gmail.com>
 AdithyanI <adithyan.i4internet@gmail.com>
 Adrian <smith.adriane@gmail.com>
 Adrian Hesketh <a-h@users.noreply.github.com>
+Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
 AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
 Aisuko <urakiny@gmail.com>
+Akarshan Biswas <akarshanbiswas@fedoraproject.org>
+Albert Jin <albert.jin@gmail.com>
 Alberto <57916483+albbus-stack@users.noreply.github.com>
 Alex <awhill19@icloud.com>
 Alex Azarov <alex@azarov.by>
@@ -35,19 +40,24 @@ Ali Nehzat <ali.nehzat@thanks.dev>
 Ali Tariq <ali.tariq@10xengineers.ai>
 Alon <alonfaraj@gmail.com>
 AlpinDale <52078762+AlpinDale@users.noreply.github.com>
+Amir <amir_zia@outlook.com>
 AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
 Ananta Bastola <anantarajbastola@gmail.com>
 Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
 András Salamon <ott2@users.noreply.github.com>
 Andrei <abetlen@gmail.com>
 Andrew Canis <andrew.canis@gmail.com>
+Andrew Downing <andrew2085@gmail.com>
 Andrew Duffy <a10y@users.noreply.github.com>
 Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
+Andy Tai <andy-tai@users.noreply.github.com>
 Arik Poznanski <arikpoz@users.noreply.github.com>
 Artem <guinmoon@gmail.com>
+Artem Zinnatullin <ceo@abstractny.gay>
 Artyom Lebedev <vagran.ast@gmail.com>
 Asbjørn Olling <asbjornolling@gmail.com>
 Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
+Ashish <1856117+ashishdatta@users.noreply.github.com>
 Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
 Ashraful Islam <ashraful.meche@gmail.com>
 Atsushi Tatsuma <yoshoku@outlook.com>
@@ -57,35 +67,46 @@ BADR <contact@pythops.com>
 Bach Le <bach@bullno1.com>
 Bailey Chittle <39804642+bachittle@users.noreply.github.com>
 BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
+Bartowski <ckealty1182@gmail.com>
 Behnam M <58621210+ibehnam@users.noreply.github.com>
+Ben Ashbaugh <ben.ashbaugh@intel.com>
 Ben Garney <bengarney@users.noreply.github.com>
 Ben Siraphob <bensiraphob@gmail.com>
 Ben Williams <ben@719ben.com>
+Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
 Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
 Bernat Vadell <hounter.caza@gmail.com>
+Bingan <70050083+binganao@users.noreply.github.com>
 Bodo Graumann <mail@bodograumann.de>
 Bono Lv <lvscar@users.noreply.github.com>
 Borislav Stanimirov <b.stanimirov@abv.bg>
 Branden Butler <bwtbutler@hotmail.com>
 Brian <mofosyne@gmail.com>
 Bruce MacDonald <brucewmacdonald@gmail.com>
+Bryan Honof <bryanhonof@gmail.com>
 CJ Pais <cj@cjpais.com>
 CRD716 <crd716@gmail.com>
+Calvin Laurenson <calvin@laurenson.dev>
 Cameron <csteele@steelecameron.com>
 Cameron Kaiser <classilla@users.noreply.github.com>
+Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
 Casey Primozic <casey@cprimozic.net>
 Casey Primozic <me@ameo.link>
 CausalLM <148736309+CausalLM@users.noreply.github.com>
 Cebtenzzre <cebtenzzre@gmail.com>
 Chad Brewbaker <crb002@gmail.com>
+Chao Jiang <jc19chaoj@zoho.com>
 Cheng Shao <terrorjack@type.dance>
+Chris Elrod <elrodc@gmail.com>
 Chris Kuehl <ckuehl@ckuehl.me>
 Christian Demsar <christian@github.email.demsar.us>
 Christian Demsar <crasm@git.vczf.us>
 Christian Falch <875252+chrfalch@users.noreply.github.com>
 Christian Kögler <ck3d@gmx.de>
+Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
 Clark Saben <76020733+csaben@users.noreply.github.com>
 Clint Herron <hanclinto@gmail.com>
+CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
 Cuong Trinh Manh <nguoithichkhampha@gmail.com>
 DAN™ <dranger003@gmail.com>
 Damian Stewart <d@damianstewart.com>
@@ -95,8 +116,12 @@ Daniel Bevenius <daniel.bevenius@gmail.com>
 Daniel Drake <drake@endlessos.org>
 Daniel Hiltgen <dhiltgen@users.noreply.github.com>
 Daniel Illescas Romero <illescas.daniel@protonmail.com>
+Daniele <57776841+daniandtheweb@users.noreply.github.com>
 DannyDaemonic <DannyDaemonic@gmail.com>
 Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
+Dave <dave-fl@users.noreply.github.com>
+Dave Airlie <airlied@gmail.com>
+Dave Airlie <airlied@redhat.com>
 Dave Della Costa <ddellacosta+github@gmail.com>
 David Friehs <david@friehs.info>
 David Kennedy <dakennedyd@gmail.com>
@@ -104,10 +129,13 @@ David Pflug <david@pflug.email>
 David Renshaw <dwrenshaw@gmail.com>
 David Sommers <12738+databyte@users.noreply.github.com>
 David Yang <davidyang6us@gmail.com>
+Dawid Potocki <github@dawidpotocki.com>
 Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
 Dean <Dean.Sinaean@gmail.com>
 Deins <deinsegle@gmail.com>
+Deven Mistry <31466137+deven367@users.noreply.github.com>
 Didzis Gosko <didzis@users.noreply.github.com>
+Djip007 <djip.perois@free.fr>
 Don Mahurin <dmahurin@users.noreply.github.com>
 DooWoong Lee (David) <manics99@naver.com>
 Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
@@ -116,8 +144,11 @@ Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
 Ebey Abraham <ebey97@gmail.com>
 Ed Lee <edilee@mozilla.com>
 Ed Lepedus <ed.lepedus@googlemail.com>
+Eddie-Wang <wangjinheng1120@163.com>
 Edward Taylor <edeetee@gmail.com>
+Elaine <elaine.zosa@gmail.com>
 Elbios <141279586+Elbios@users.noreply.github.com>
+Elton Kola <eltonkola@gmail.com>
 Engininja2 <139037756+Engininja2@users.noreply.github.com>
 Equim <sayaka@ekyu.moe>
 Eric Sommerlade <es0m@users.noreply.github.com>
@@ -143,37 +174,47 @@ Firat <firatkiral@gmail.com>
 Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
 Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
 Francisco Melo <43780565+francis2tm@users.noreply.github.com>
+Frank Mai <thxcode0824@gmail.com>
 FrankHB <frankhb1989@gmail.com>
+Fred Douglas <43351173+fredlas@users.noreply.github.com>
 Frederik Vogel <Schaltfehler@users.noreply.github.com>
 Gabe Goodhart <gabe.l.hart@gmail.com>
 GainLee <perfecter.gen@gmail.com>
 Galunid <karolek1231456@gmail.com>
 Gary Linscott <glinscott@gmail.com>
 Gary Mulder <gjmulder@gmail.com>
+Gavin Zhao <gavinzhaojw@protonmail.com>
 Genkagaku.GPT <hlhr202@163.com>
 Georgi Gerganov <ggerganov@gmail.com>
 Gilad S <giladgd@users.noreply.github.com>
+Giuseppe Scrivano <giuseppe@scrivano.org>
 GiviMAD <GiviMAD@users.noreply.github.com>
 Govlzkoy <gotope@users.noreply.github.com>
 Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
 Guillaume Wenzek <gwenzek@users.noreply.github.com>
 Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
 Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
+Haggai Nuchi <h.nuchi@gmail.com>
 Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
+Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com>
+HanishKVC <hanishkvc@gmail.com>
 Haohui Mai <ricetons@gmail.com>
 Haoxiang Fei <tonyfettes@tonyfettes.com>
 Harald Fernengel <harald.fernengel@here.com>
 Hatsune Miku <129688334+at8u@users.noreply.github.com>
+HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
 Henk Poley <HenkPoley@gmail.com>
 Henri Vasserman <henv@hot.ee>
 Henrik Forstén <henrik.forsten@gmail.com>
 Herman Semenov <GermanAizek@yandex.ru>
 Hesen Peng <hesen.peng@gmail.com>
 Hoang Nguyen <hugo53@users.noreply.github.com>
+Hong Bo PENG <penghb@cn.ibm.com>
 Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
 Howard Su <howard0su@gmail.com>
 Hua Jiang <allenhjiang@outlook.com>
 Huawei Lin <huaweilin.cs@gmail.com>
+Hugo Roussel <hugo.rous@gmail.com>
 Ian Bull <irbull@eclipsesource.com>
 Ian Bull <irbull@gmail.com>
 Ian Scrivener <github@zilogy.asia>
@@ -190,8 +231,10 @@ Ivan Stepanov <ivanstepanovftw@gmail.com>
 JH23X <165871467+JH23X@users.noreply.github.com>
 Jack Mousseau <jmousseau@users.noreply.github.com>
 JackJollimore <130917767+JackJollimore@users.noreply.github.com>
+Jaemin Son <woalsdnd@gmail.com>
 Jag Chadha <jagtesh@gmail.com>
 Jakub N <jakubniemczyk97@gmail.com>
+James A Capozzoli <157492257+jac-jim@users.noreply.github.com>
 James Reynolds <magnusviri@users.noreply.github.com>
 Jan Boon <jan.boon@kaetemi.be>
 Jan Boon <kaetemi@gmail.com>
@@ -205,12 +248,17 @@ Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
 Jed Fox <git@jedfox.com>
 Jeffrey Quesnelle <emozilla@nousresearch.com>
 Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
+Jeximo <jeximo@gmail.com>
 Jhen-Jie Hong <iainst0409@gmail.com>
 Jiahao Li <liplus17@163.com>
 Jian Liao <jianliao@users.noreply.github.com>
 JidongZhang-THU <1119708529@qq.com>
 Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
 Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
+Jiří Sejkora <Sejseloid@gmail.com>
+Joan Fontanals <jfontanalsmartinez@gmail.com>
+Joan Fontanals <joan.fontanals.martinez@jina.ai>
+Johan <JohanAR@users.noreply.github.com>
 Johannes Gäßler <johannesg@5d6.de>
 Johannes Rudolph <johannes.rudolph@gmail.com>
 John <78893154+cmp-nct@users.noreply.github.com>
@@ -221,15 +269,19 @@ Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
 Jorge A <161275481+jorgealias@users.noreply.github.com>
 Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
 Joseph Stahl <1269177+josephst@users.noreply.github.com>
+Josh Ramer <josh.ramer@icloud.com>
 Joyce <joycebrum@google.com>
 Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
 Judd <foldl@users.noreply.github.com>
 Julius Arkenberg <arki05@users.noreply.github.com>
 Jun Jie <71215065+junnjiee16@users.noreply.github.com>
+Junyang Lin <justinlin930319@hotmail.com>
 Juraj Bednar <juraj@bednar.io>
 Justin Parker <jparkerweb@gmail.com>
 Justin Suess <justin.suess@westpoint.edu>
+Justina Cho <justcho5@gmail.com>
 Justine Tunney <jtunney@gmail.com>
+Justine Tunney <jtunney@mozilla.com>
 Juuso Alasuutari <juuso.alasuutari@gmail.com>
 KASR <karim.asrih@gmail.com>
 Kamil Tomšík <info@tomsik.cz>
@@ -242,6 +294,7 @@ Kawrakow <48489457+ikawrakow@users.noreply.github.com>
 Keiichi Tabata <keiichi.tabata@outlook.com>
 Kenvix ⭐ <kenvixzure@live.com>
 Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
+Kevin Gibbons <bakkot@gmail.com>
 Kevin Ji <1146876+kevinji@users.noreply.github.com>
 Kevin Kwok <antimatter15@gmail.com>
 Kevin Lo <kevlo@kevlo.org>
@@ -257,6 +310,7 @@ Laura <Tijntje_7@msn.com>
 Lee <44310445+lx200916@users.noreply.github.com>
 Lee Drake <b.lee.drake@gmail.com>
|
Lee Drake <b.lee.drake@gmail.com>
|
||||||
Leng Yue <lengyue@lengyue.me>
|
Leng Yue <lengyue@lengyue.me>
|
||||||
|
Leon Knauer <git@leonknauer.com>
|
||||||
LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
|
LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
|
||||||
Leonardo Neumann <leonardo@neumann.dev.br>
|
Leonardo Neumann <leonardo@neumann.dev.br>
|
||||||
Li Tan <tanliboy@gmail.com>
|
Li Tan <tanliboy@gmail.com>
|
||||||
|
@ -265,20 +319,26 @@ LoganDark <github@logandark.mozmail.com>
|
||||||
LostRuins <39025047+LostRuins@users.noreply.github.com>
|
LostRuins <39025047+LostRuins@users.noreply.github.com>
|
||||||
Luciano <lucianostrika44@gmail.com>
|
Luciano <lucianostrika44@gmail.com>
|
||||||
Luo Tian <lt@basecity.com>
|
Luo Tian <lt@basecity.com>
|
||||||
|
Lyle Dean <dean@lyle.dev>
|
||||||
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
|
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
|
||||||
Maarten ter Huurne <maarten@treewalker.org>
|
Maarten ter Huurne <maarten@treewalker.org>
|
||||||
Mack Straight <eiz@users.noreply.github.com>
|
Mack Straight <eiz@users.noreply.github.com>
|
||||||
Maël Kerbiriou <m431.kerbiriou@gmail.com>
|
Maël Kerbiriou <m431.kerbiriou@gmail.com>
|
||||||
MaggotHATE <clay1326@gmail.com>
|
MaggotHATE <clay1326@gmail.com>
|
||||||
|
Manuel <44313466+makuche@users.noreply.github.com>
|
||||||
Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
|
Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
|
||||||
Marco Matthies <71844+marcom@users.noreply.github.com>
|
Marco Matthies <71844+marcom@users.noreply.github.com>
|
||||||
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
|
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
|
||||||
Marian Cepok <marian.cepok@gmail.com>
|
Marian Cepok <marian.cepok@gmail.com>
|
||||||
Mark Fairbairn <thebaron88@gmail.com>
|
Mark Fairbairn <thebaron88@gmail.com>
|
||||||
Marko Tasic <mtasic85@gmail.com>
|
Marko Tasic <mtasic85@gmail.com>
|
||||||
|
Markus Tavenrath <mtavenrath@users.noreply.github.com>
|
||||||
|
Martin Delille <martin@delille.org>
|
||||||
Martin Krasser <krasserm@googlemail.com>
|
Martin Krasser <krasserm@googlemail.com>
|
||||||
Martin Schwaighofer <mschwaig@users.noreply.github.com>
|
Martin Schwaighofer <mschwaig@users.noreply.github.com>
|
||||||
Marvin Gießing <marvin.giessing@gmail.com>
|
Marvin Gießing <marvin.giessing@gmail.com>
|
||||||
|
Masaya, Kato <62578291+msy-kato@users.noreply.github.com>
|
||||||
|
MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
|
||||||
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
|
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
|
||||||
Matheus C. França <matheus-catarino@hotmail.com>
|
Matheus C. França <matheus-catarino@hotmail.com>
|
||||||
Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
|
Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
|
||||||
|
@ -287,8 +347,11 @@ Mathijs de Bruin <mathijs@mathijsfietst.nl>
|
||||||
Matt Clayton <156335168+mattjcly@users.noreply.github.com>
|
Matt Clayton <156335168+mattjcly@users.noreply.github.com>
|
||||||
Matt Pulver <matt.pulver@heavy.ai>
|
Matt Pulver <matt.pulver@heavy.ai>
|
||||||
Matteo Boschini <12133566+mbosc@users.noreply.github.com>
|
Matteo Boschini <12133566+mbosc@users.noreply.github.com>
|
||||||
|
Mattheus Chediak <shammcity00@gmail.com>
|
||||||
Matthew Tejo <matthew.tejo@gmail.com>
|
Matthew Tejo <matthew.tejo@gmail.com>
|
||||||
Matvey Soloviev <blackhole89@gmail.com>
|
Matvey Soloviev <blackhole89@gmail.com>
|
||||||
|
Max Krasnyansky <max.krasnyansky@gmail.com>
|
||||||
|
Max Krasnyansky <quic_maxk@quicinc.com>
|
||||||
Maxime <672982+maximegmd@users.noreply.github.com>
|
Maxime <672982+maximegmd@users.noreply.github.com>
|
||||||
Maximilian Winter <maximilian.winter.91@gmail.com>
|
Maximilian Winter <maximilian.winter.91@gmail.com>
|
||||||
Meng Zhang <meng@tabbyml.com>
|
Meng Zhang <meng@tabbyml.com>
|
||||||
|
@ -300,32 +363,41 @@ Michael Kesper <mkesper@schokokeks.org>
|
||||||
Michael Klimenko <mklimenko29@gmail.com>
|
Michael Klimenko <mklimenko29@gmail.com>
|
||||||
Michael Podvitskiy <podvitskiymichael@gmail.com>
|
Michael Podvitskiy <podvitskiymichael@gmail.com>
|
||||||
Michael Potter <NanoTekGuy@Gmail.com>
|
Michael Potter <NanoTekGuy@Gmail.com>
|
||||||
|
Michael de Gans <michael.john.degans@gmail.com>
|
||||||
Michaël de Vries <vriesdemichael@gmail.com>
|
Michaël de Vries <vriesdemichael@gmail.com>
|
||||||
Mihai <mihai.chirculescu@yahoo.com>
|
Mihai <mihai.chirculescu@yahoo.com>
|
||||||
Mike <ytianhui2004@gmail.com>
|
Mike <ytianhui2004@gmail.com>
|
||||||
|
Mikko Juola <mikjuo@gmail.com>
|
||||||
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
|
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
|
||||||
Mirko185 <mirkosig@gmail.com>
|
Mirko185 <mirkosig@gmail.com>
|
||||||
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
|
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
|
||||||
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
|
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
|
||||||
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
|
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
|
||||||
|
Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
|
||||||
Murilo Santana <mvrilo@gmail.com>
|
Murilo Santana <mvrilo@gmail.com>
|
||||||
Musab Gultekin <musabgultekin@users.noreply.github.com>
|
Musab Gultekin <musabgultekin@users.noreply.github.com>
|
||||||
Nam D. Tran <42194884+namtranase@users.noreply.github.com>
|
Nam D. Tran <42194884+namtranase@users.noreply.github.com>
|
||||||
|
Nathan Epstein <nate2@umbc.edu>
|
||||||
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
|
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
|
||||||
Nebula <infinitewormhole@gmail.com>
|
Nebula <infinitewormhole@gmail.com>
|
||||||
|
Neo Zhang <14088817+arthw@users.noreply.github.com>
|
||||||
|
Neo Zhang <zhang.jianyu@outlook.com>
|
||||||
Neo Zhang Jianyu <jianyu.zhang@intel.com>
|
Neo Zhang Jianyu <jianyu.zhang@intel.com>
|
||||||
Neuman Vong <neuman.vong@gmail.com>
|
Neuman Vong <neuman.vong@gmail.com>
|
||||||
Nexesenex <124105151+Nexesenex@users.noreply.github.com>
|
Nexesenex <124105151+Nexesenex@users.noreply.github.com>
|
||||||
Niall Coates <1349685+Niall-@users.noreply.github.com>
|
Niall Coates <1349685+Niall-@users.noreply.github.com>
|
||||||
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
|
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
|
||||||
|
Nicolás Pérez <nicolas_perez@brown.edu>
|
||||||
Nigel Bosch <pnigelb@gmail.com>
|
Nigel Bosch <pnigelb@gmail.com>
|
||||||
Niklas Korz <niklas@niklaskorz.de>
|
Niklas Korz <niklas@niklaskorz.de>
|
||||||
|
Nikolas <127742645+nneubacher@users.noreply.github.com>
|
||||||
Nindaleth <Nindaleth@users.noreply.github.com>
|
Nindaleth <Nindaleth@users.noreply.github.com>
|
||||||
Oleksandr Nikitin <oleksandr@tvori.info>
|
Oleksandr Nikitin <oleksandr@tvori.info>
|
||||||
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
|
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
|
||||||
Olivier Chafik <ochafik@users.noreply.github.com>
|
Olivier Chafik <ochafik@users.noreply.github.com>
|
||||||
Ondřej Čertík <ondrej@certik.us>
|
Ondřej Čertík <ondrej@certik.us>
|
||||||
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
|
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
|
||||||
|
Patrice Ferlet <metal3d@gmail.com>
|
||||||
Paul Tsochantaris <ptsochantaris@icloud.com>
|
Paul Tsochantaris <ptsochantaris@icloud.com>
|
||||||
Pavol Rusnak <pavol@rusnak.io>
|
Pavol Rusnak <pavol@rusnak.io>
|
||||||
Pedro Cuenca <pedro@huggingface.co>
|
Pedro Cuenca <pedro@huggingface.co>
|
||||||
|
@ -343,9 +415,14 @@ RJ Adriaansen <adriaansen@eshcc.eur.nl>
|
||||||
Radoslav Gerganov <rgerganov@gmail.com>
|
Radoslav Gerganov <rgerganov@gmail.com>
|
||||||
Radosław Gryta <radek.gryta@gmail.com>
|
Radosław Gryta <radek.gryta@gmail.com>
|
||||||
Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
|
Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
|
||||||
|
Raj Hammeer Singh Hada <hammeerraj@gmail.com>
|
||||||
|
Ralph Soika <ralph.soika@imixs.com>
|
||||||
Rand Xie <randxiexyy29@gmail.com>
|
Rand Xie <randxiexyy29@gmail.com>
|
||||||
Randall Fitzgerald <randall@dasaku.net>
|
Randall Fitzgerald <randall@dasaku.net>
|
||||||
Reinforce-II <fate@eastal.com>
|
Reinforce-II <fate@eastal.com>
|
||||||
|
Ren Xuancheng <jklj077@users.noreply.github.com>
|
||||||
|
Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
|
||||||
|
RhinoDevel <RhinoDevel@users.noreply.github.com>
|
||||||
Riceball LEE <snowyu.lee@gmail.com>
|
Riceball LEE <snowyu.lee@gmail.com>
|
||||||
Richard Kiss <him@richardkiss.com>
|
Richard Kiss <him@richardkiss.com>
|
||||||
Richard Roberson <richardr1126@gmail.com>
|
Richard Roberson <richardr1126@gmail.com>
|
||||||
|
@ -373,6 +450,7 @@ Rowan Hart <rowanbhart@gmail.com>
|
||||||
Rune <43761327+Rune-AI@users.noreply.github.com>
|
Rune <43761327+Rune-AI@users.noreply.github.com>
|
||||||
Ryan Landay <rlanday@gmail.com>
|
Ryan Landay <rlanday@gmail.com>
|
||||||
Ryder Wishart <ryderwishart@gmail.com>
|
Ryder Wishart <ryderwishart@gmail.com>
|
||||||
|
Ryuei <louixs@users.noreply.github.com>
|
||||||
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
||||||
SakuraUmi <yukinon244@gmail.com>
|
SakuraUmi <yukinon244@gmail.com>
|
||||||
Salvador E. Tropea <stropea@inti.gob.ar>
|
Salvador E. Tropea <stropea@inti.gob.ar>
|
||||||
|
@ -386,6 +464,7 @@ SebastianApel <13675545+SebastianApel@users.noreply.github.com>
|
||||||
Senemu <10880819+Senemu@users.noreply.github.com>
|
Senemu <10880819+Senemu@users.noreply.github.com>
|
||||||
Sergey Alirzaev <zl29ah@gmail.com>
|
Sergey Alirzaev <zl29ah@gmail.com>
|
||||||
Sergio López <slp@sinrega.org>
|
Sergio López <slp@sinrega.org>
|
||||||
|
Sertaç Özercan <852750+sozercan@users.noreply.github.com>
|
||||||
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
|
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
|
||||||
ShadovvBeast <ShadovvBeast@gmail.com>
|
ShadovvBeast <ShadovvBeast@gmail.com>
|
||||||
Shakhar Dasgupta <shakhardasgupta@gmail.com>
|
Shakhar Dasgupta <shakhardasgupta@gmail.com>
|
||||||
|
@ -394,6 +473,7 @@ Shijie <821898965@qq.com>
|
||||||
Shintarou Okada <kokuzen@gmail.com>
|
Shintarou Okada <kokuzen@gmail.com>
|
||||||
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
||||||
Shouzheng Liu <lshzh.hi@gmail.com>
|
Shouzheng Liu <lshzh.hi@gmail.com>
|
||||||
|
Shuichi Tsutsumi <shuichi0526@gmail.com>
|
||||||
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
|
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
|
||||||
Simon Willison <swillison@gmail.com>
|
Simon Willison <swillison@gmail.com>
|
||||||
Siwen Yu <yusiwen@gmail.com>
|
Siwen Yu <yusiwen@gmail.com>
|
||||||
|
@ -405,11 +485,14 @@ Someone <sergei.kozlukov@aalto.fi>
|
||||||
Someone Serge <sergei.kozlukov@aalto.fi>
|
Someone Serge <sergei.kozlukov@aalto.fi>
|
||||||
Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
|
Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
|
||||||
Spencer Sutton <spencersutton@users.noreply.github.com>
|
Spencer Sutton <spencersutton@users.noreply.github.com>
|
||||||
|
Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
|
||||||
Srinivas Billa <nivibilla@gmail.com>
|
Srinivas Billa <nivibilla@gmail.com>
|
||||||
Stefan Sydow <stefan@sydow.email>
|
Stefan Sydow <stefan@sydow.email>
|
||||||
|
Steffen Röcker <sroecker@gmail.com>
|
||||||
Stephan Walter <stephan@walter.name>
|
Stephan Walter <stephan@walter.name>
|
||||||
Stephen Nichols <snichols@users.noreply.github.com>
|
Stephen Nichols <snichols@users.noreply.github.com>
|
||||||
Steve Grubb <ausearch.1@gmail.com>
|
Steve Grubb <ausearch.1@gmail.com>
|
||||||
|
Steven Prichard <spprichard20@gmail.com>
|
||||||
Steven Roussey <sroussey@gmail.com>
|
Steven Roussey <sroussey@gmail.com>
|
||||||
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
|
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
|
||||||
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
|
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
|
||||||
|
@ -434,16 +517,19 @@ Tom C <tom.corelis@gmail.com>
|
||||||
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
|
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
|
||||||
Tomas <tom.tomas.36478119@gmail.com>
|
Tomas <tom.tomas.36478119@gmail.com>
|
||||||
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
|
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
|
||||||
|
Tristan Druyen <tristan@vault81.mozmail.com>
|
||||||
Tristan Ross <rosscomputerguy@protonmail.com>
|
Tristan Ross <rosscomputerguy@protonmail.com>
|
||||||
Tungsten842 <886724vf@anonaddy.me>
|
Tungsten842 <886724vf@anonaddy.me>
|
||||||
Tungsten842 <quantmint@protonmail.com>
|
Tungsten842 <quantmint@protonmail.com>
|
||||||
Tushar <ditsuke@protonmail.com>
|
Tushar <ditsuke@protonmail.com>
|
||||||
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
||||||
|
Ulrich Drepper <drepper@gmail.com>
|
||||||
Uzo Nweke <uzoechi@gmail.com>
|
Uzo Nweke <uzoechi@gmail.com>
|
||||||
Vaibhav Srivastav <vaibhavs10@gmail.com>
|
Vaibhav Srivastav <vaibhavs10@gmail.com>
|
||||||
Val Kharitonov <mail@kharvd.com>
|
Val Kharitonov <mail@kharvd.com>
|
||||||
Valentin Konovalov <valle.ketsujin@gmail.com>
|
Valentin Konovalov <valle.ketsujin@gmail.com>
|
||||||
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
|
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
|
||||||
|
Victor Nogueira <felladrin@gmail.com>
|
||||||
Victor Z. Peng <ziliangdotme@gmail.com>
|
Victor Z. Peng <ziliangdotme@gmail.com>
|
||||||
Vlad <spitfireage@gmail.com>
|
Vlad <spitfireage@gmail.com>
|
||||||
Vladimir <bogdad@gmail.com>
|
Vladimir <bogdad@gmail.com>
|
||||||
|
@ -455,7 +541,9 @@ Weird Constructor <weirdconstructor@gmail.com>
|
||||||
Welby Seely <welbyseely@gmail.com>
|
Welby Seely <welbyseely@gmail.com>
|
||||||
Wentai Zhang <rchardx@gmail.com>
|
Wentai Zhang <rchardx@gmail.com>
|
||||||
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
|
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
|
||||||
|
William Tambellini <william.tambellini@gmail.com>
|
||||||
Willy Tarreau <w@1wt.eu>
|
Willy Tarreau <w@1wt.eu>
|
||||||
|
Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
|
||||||
Wu Jian Ping <wujjpp@hotmail.com>
|
Wu Jian Ping <wujjpp@hotmail.com>
|
||||||
Wu Jian Ping <wujp@greatld.com>
|
Wu Jian Ping <wujp@greatld.com>
|
||||||
Xiake Sun <xiake.sun@intel.com>
|
Xiake Sun <xiake.sun@intel.com>
|
||||||
|
@ -466,6 +554,8 @@ Xiaoyi Chen <cxychina@gmail.com>
|
||||||
Xingchen Song(宋星辰) <xingchensong1996@163.com>
|
Xingchen Song(宋星辰) <xingchensong1996@163.com>
|
||||||
Xuan Son Nguyen <thichthat@gmail.com>
|
Xuan Son Nguyen <thichthat@gmail.com>
|
||||||
Yann Follet <131855179+YannFollet@users.noreply.github.com>
|
Yann Follet <131855179+YannFollet@users.noreply.github.com>
|
||||||
|
Yaroslav <yaroslav.yashin@me.com>
|
||||||
|
Yazan Agha-Schrader <mountaiin@icloud.com>
|
||||||
Yiming Cui <conandiy@vip.qq.com>
|
Yiming Cui <conandiy@vip.qq.com>
|
||||||
Yishuo Wang <MeouSker77@outlook.com>
|
Yishuo Wang <MeouSker77@outlook.com>
|
||||||
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
|
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
|
||||||
|
@ -477,6 +567,7 @@ Zane Shannon <z@zcs.me>
|
||||||
Zay <95888118+isaiahbjork@users.noreply.github.com>
|
Zay <95888118+isaiahbjork@users.noreply.github.com>
|
||||||
Zenix <zenixls2@gmail.com>
|
Zenix <zenixls2@gmail.com>
|
||||||
Zhang Peiyuan <a1286225768@gmail.com>
|
Zhang Peiyuan <a1286225768@gmail.com>
|
||||||
|
Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
|
||||||
ZhouYuChen <zhouyuchen@naver.com>
|
ZhouYuChen <zhouyuchen@naver.com>
|
||||||
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
|
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
|
||||||
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
|
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
|
||||||
|
@ -484,14 +575,18 @@ Zsapi <martin1.zsapka@gmail.com>
|
||||||
a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
|
a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
|
||||||
adel boussaken <netdur@gmail.com>
|
adel boussaken <netdur@gmail.com>
|
||||||
afrideva <95653597+afrideva@users.noreply.github.com>
|
afrideva <95653597+afrideva@users.noreply.github.com>
|
||||||
|
agray3 <agray3@users.noreply.github.com>
|
||||||
akawrykow <142945436+akawrykow@users.noreply.github.com>
|
akawrykow <142945436+akawrykow@users.noreply.github.com>
|
||||||
alexpinel <93524949+alexpinel@users.noreply.github.com>
|
alexpinel <93524949+alexpinel@users.noreply.github.com>
|
||||||
alonfaraj <alonfaraj@gmail.com>
|
alonfaraj <alonfaraj@gmail.com>
|
||||||
|
alwqx <kenan3015@gmail.com>
|
||||||
|
amd-lalithnc <lalithnc@amd.com>
|
||||||
andrijdavid <david@geek.mg>
|
andrijdavid <david@geek.mg>
|
||||||
anon998 <131767832+anon998@users.noreply.github.com>
|
anon998 <131767832+anon998@users.noreply.github.com>
|
||||||
anzz1 <anzz1@live.com>
|
anzz1 <anzz1@live.com>
|
||||||
apaz <aarpazdera@gmail.com>
|
apaz <aarpazdera@gmail.com>
|
||||||
apcameron <37645737+apcameron@users.noreply.github.com>
|
apcameron <37645737+apcameron@users.noreply.github.com>
|
||||||
|
arch-btw <57669023+arch-btw@users.noreply.github.com>
|
||||||
arcrank <arcrank@gmail.com>
|
arcrank <arcrank@gmail.com>
|
||||||
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
|
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
|
||||||
at8u <129688334+at8u@users.noreply.github.com>
|
at8u <129688334+at8u@users.noreply.github.com>
|
||||||
|
@ -514,13 +609,17 @@ cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
|
||||||
coezbek <c.oezbek@gmail.com>
|
coezbek <c.oezbek@gmail.com>
|
||||||
comex <comexk@gmail.com>
|
comex <comexk@gmail.com>
|
||||||
compilade <113953597+compilade@users.noreply.github.com>
|
compilade <113953597+compilade@users.noreply.github.com>
|
||||||
|
compilade <git@compilade.net>
|
||||||
|
cpumaxx <163466046+cpumaxx@users.noreply.github.com>
|
||||||
crasm <crasm@git.vczf.net>
|
crasm <crasm@git.vczf.net>
|
||||||
crasm <crasm@git.vczf.us>
|
crasm <crasm@git.vczf.us>
|
||||||
daboe01 <daboe01@googlemail.com>
|
daboe01 <daboe01@googlemail.com>
|
||||||
david raistrick <keen99@users.noreply.github.com>
|
david raistrick <keen99@users.noreply.github.com>
|
||||||
|
ddh0 <dylanhalladay02@icloud.com>
|
||||||
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
||||||
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
||||||
divinity76 <divinity76@gmail.com>
|
divinity76 <divinity76@gmail.com>
|
||||||
|
dm4 <sunrisedm4@gmail.com>
|
||||||
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
||||||
drbh <david.richard.holtz@gmail.com>
|
drbh <david.richard.holtz@gmail.com>
|
||||||
ds5t5 <145942675+ds5t5@users.noreply.github.com>
|
ds5t5 <145942675+ds5t5@users.noreply.github.com>
|
||||||
|
@ -529,6 +628,7 @@ eastriver <lee@eastriver.dev>
|
||||||
ebraminio <ebraminio@gmail.com>
|
ebraminio <ebraminio@gmail.com>
|
||||||
eiery <19350831+eiery@users.noreply.github.com>
|
eiery <19350831+eiery@users.noreply.github.com>
|
||||||
eric8607242 <e0928021388@gmail.com>
|
eric8607242 <e0928021388@gmail.com>
|
||||||
|
fairydreaming <166155368+fairydreaming@users.noreply.github.com>
|
||||||
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
||||||
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
||||||
gliptic <gliptic@users.noreply.github.com>
|
gliptic <gliptic@users.noreply.github.com>
|
||||||
|
@ -539,6 +639,7 @@ h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
|
||||||
hankcs <cnhankmc@gmail.com>
|
hankcs <cnhankmc@gmail.com>
|
||||||
hoangmit <hoangmit@users.noreply.github.com>
|
hoangmit <hoangmit@users.noreply.github.com>
|
||||||
hongbo.mo <352280764@qq.com>
|
hongbo.mo <352280764@qq.com>
|
||||||
|
hopkins385 <98618192+hopkins385@users.noreply.github.com>
|
||||||
howlger <eclipse@voormann.de>
|
howlger <eclipse@voormann.de>
|
||||||
howlger <github@voormann.de>
|
howlger <github@voormann.de>
|
||||||
hutli <6594598+hutli@users.noreply.github.com>
|
hutli <6594598+hutli@users.noreply.github.com>
|
||||||
|
@ -549,14 +650,22 @@ hydai <z54981220@gmail.com>
|
||||||
iSma <ismail.senhaji@gmail.com>
|
iSma <ismail.senhaji@gmail.com>
|
||||||
iacore <74560659+iacore@users.noreply.github.com>
|
iacore <74560659+iacore@users.noreply.github.com>
|
||||||
igarnier <igarnier@protonmail.com>
|
igarnier <igarnier@protonmail.com>
|
||||||
|
intelmatt <61025942+intelmatt@users.noreply.github.com>
|
||||||
iohub <rickyang.pro@gmail.com>
|
iohub <rickyang.pro@gmail.com>
|
||||||
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
||||||
|
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
|
||||||
jameswu2014 <545426914@qq.com>
|
jameswu2014 <545426914@qq.com>
|
||||||
|
jiez <373447296@qq.com>
|
||||||
jneem <joeneeman@gmail.com>
|
jneem <joeneeman@gmail.com>
|
||||||
|
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
|
||||||
johnson442 <56517414+johnson442@users.noreply.github.com>
|
johnson442 <56517414+johnson442@users.noreply.github.com>
|
||||||
|
jojorne <jojorne@users.noreply.github.com>
|
||||||
jon-chuang <9093549+jon-chuang@users.noreply.github.com>
|
jon-chuang <9093549+jon-chuang@users.noreply.github.com>
|
||||||
jp-x-g <jpxg-dev@protonmail.com>
|
jp-x-g <jpxg-dev@protonmail.com>
|
||||||
|
jukofyork <69222624+jukofyork@users.noreply.github.com>
|
||||||
|
junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
|
||||||
jwj7140 <32943891+jwj7140@users.noreply.github.com>
|
jwj7140 <32943891+jwj7140@users.noreply.github.com>
|
||||||
|
k.h.lai <adrian.k.h.lai@outlook.com>
|
||||||
kaizau <kaizau@users.noreply.github.com>
|
kaizau <kaizau@users.noreply.github.com>
|
||||||
kalomaze <66376113+kalomaze@users.noreply.github.com>
|
kalomaze <66376113+kalomaze@users.noreply.github.com>
|
||||||
kang <tpdns9032100@gmail.com>
|
kang <tpdns9032100@gmail.com>
|
||||||
|
@ -575,11 +684,15 @@ ldwang <ftgreat@163.com>
|
||||||
le.chang <cljs118@126.com>
|
le.chang <cljs118@126.com>
|
||||||
leejet <leejet714@gmail.com>
|
leejet <leejet714@gmail.com>
|
||||||
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
||||||
|
liuwei-git <14815172+liuwei-git@users.noreply.github.com>
|
||||||
lon <114724657+longregen@users.noreply.github.com>
|
lon <114724657+longregen@users.noreply.github.com>
|
||||||
|
loonerin <132926317+loonerin@users.noreply.github.com>
|
||||||
|
luoyu-intel <yu.luo@intel.com>
|
||||||
m3ndax <adrian.goessl@outlook.com>
|
m3ndax <adrian.goessl@outlook.com>
|
||||||
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
||||||
makomk <makosoft@googlemail.com>
|
makomk <makosoft@googlemail.com>
|
||||||
manikbhandari <mbbhandarimanik2@gmail.com>
|
manikbhandari <mbbhandarimanik2@gmail.com>
|
||||||
|
maor-ps <154728172+maor-ps@users.noreply.github.com>
|
||||||
mdrokz <mohammadmunshi@gmail.com>
|
mdrokz <mohammadmunshi@gmail.com>
|
||||||
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
||||||
minarchist <minarchist@users.noreply.github.com>
|
minarchist <minarchist@users.noreply.github.com>
|
||||||
|
@ -593,15 +706,19 @@ ngc92 <7938269+ngc92@users.noreply.github.com>
|
||||||
nhamanasu <45545786+nhamanasu@users.noreply.github.com>
|
nhamanasu <45545786+nhamanasu@users.noreply.github.com>
|
||||||
niansa/tuxifan <anton-sa@web.de>
|
niansa/tuxifan <anton-sa@web.de>
|
||||||
niansa/tuxifan <tuxifan@posteo.de>
|
niansa/tuxifan <tuxifan@posteo.de>
|
||||||
|
nickp27 <nb.porter@gmail.com>
|
||||||
ningshanwutuobang <ningshanwutuobang@gmail.com>
|
ningshanwutuobang <ningshanwutuobang@gmail.com>
|
||||||
nold <Nold360@users.noreply.github.com>
|
nold <Nold360@users.noreply.github.com>
|
||||||
nopperl <54780682+nopperl@users.noreply.github.com>
|
nopperl <54780682+nopperl@users.noreply.github.com>
|
||||||
nusu-github <29514220+nusu-github@users.noreply.github.com>
|
nusu-github <29514220+nusu-github@users.noreply.github.com>
|
||||||
olexiyb <olexiyb@gmail.com>
|
olexiyb <olexiyb@gmail.com>
|
||||||
|
omahs <73983677+omahs@users.noreply.github.com>
|
||||||
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
||||||
opparco <parco.opaai@gmail.com>
|
opparco <parco.opaai@gmail.com>
|
||||||
ostix360 <55257054+ostix360@users.noreply.github.com>
|
ostix360 <55257054+ostix360@users.noreply.github.com>
|
||||||
|
pengxin99 <pengxin.yuan@intel.com>
|
||||||
perserk <perserk@gmail.com>
|
perserk <perserk@gmail.com>
|
||||||
|
pmysl <piotr.myslinski@outlook.com>
|
||||||
postmasters <namnguyen@google.com>
|
postmasters <namnguyen@google.com>
|
||||||
pudepiedj <pudepiedj@gmail.com>
|
pudepiedj <pudepiedj@gmail.com>
|
||||||
qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
|
qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
|
||||||
|
@ -614,16 +731,19 @@ rhuddleston <ryan.huddleston@percona.com>
|
||||||
rimoliga <53384203+rimoliga@users.noreply.github.com>
|
rimoliga <53384203+rimoliga@users.noreply.github.com>
|
||||||
runfuture <runfuture@users.noreply.github.com>
|
runfuture <runfuture@users.noreply.github.com>
|
||||||
sandyiscool <sandyiscool@gmail.com>
|
sandyiscool <sandyiscool@gmail.com>
|
||||||
|
sasha0552 <admin@sasha0552.org>
|
||||||
semidark <me@semidark.net>
|
semidark <me@semidark.net>
|
||||||
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
||||||
shibe2 <shibe@tuta.io>
|
shibe2 <shibe@tuta.io>
|
||||||
singularity <12184989+singularity-s0@users.noreply.github.com>
|
singularity <12184989+singularity-s0@users.noreply.github.com>
|
||||||
sjinzh <sjinzh@gmail.com>
|
sjinzh <sjinzh@gmail.com>
|
||||||
|
sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
|
||||||
slaren <2141330+slaren@users.noreply.github.com>
|
slaren <2141330+slaren@users.noreply.github.com>
|
||||||
slaren <slarengh@gmail.com>
|
slaren <slarengh@gmail.com>
|
||||||
snadampal <87143774+snadampal@users.noreply.github.com>
|
snadampal <87143774+snadampal@users.noreply.github.com>
|
||||||
staviq <staviq@gmail.com>
|
staviq <staviq@gmail.com>
|
||||||
stduhpf <stephduh@live.fr>
|
stduhpf <stephduh@live.fr>
|
||||||
|
strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
|
||||||
swittk <switt1995@gmail.com>
|
swittk <switt1995@gmail.com>
|
||||||
takov751 <40316768+takov751@users.noreply.github.com>
|
takov751 <40316768+takov751@users.noreply.github.com>
|
||||||
tarcey <cey.tarik@gmail.com>
|
tarcey <cey.tarik@gmail.com>
|
||||||
|
@ -636,12 +756,16 @@ uint256_t <konndennsa@gmail.com>
|
||||||
uint256_t <maekawatoshiki1017@gmail.com>
|
uint256_t <maekawatoshiki1017@gmail.com>
|
||||||
unbounded <haakon@likedan.net>
|
unbounded <haakon@likedan.net>
|
||||||
valiray <133289098+valiray@users.noreply.github.com>
|
valiray <133289098+valiray@users.noreply.github.com>
|
||||||
|
vik <vikhyatk@gmail.com>
|
||||||
|
viric <viric@viric.name>
|
||||||
vodkaslime <646329483@qq.com>
|
vodkaslime <646329483@qq.com>
|
||||||
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
||||||
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
||||||
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
||||||
whoreson <139810751+whoreson@users.noreply.github.com>
|
whoreson <139810751+whoreson@users.noreply.github.com>
|
||||||
|
woachk <24752637+woachk@users.noreply.github.com>
|
||||||
wonjun Jang <strutive07@gmail.com>
|
wonjun Jang <strutive07@gmail.com>
|
||||||
|
woodx <124784234+woodx9@users.noreply.github.com>
|
||||||
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
||||||
xaedes <xaedes@gmail.com>
|
xaedes <xaedes@gmail.com>
|
||||||
xaedes <xaedes@googlemail.com>
|
xaedes <xaedes@googlemail.com>
|
||||||
|
@ -649,7 +773,10 @@ xloem <0xloem@gmail.com>
|
||||||
yangli2 <yangli2@gmail.com>
|
yangli2 <yangli2@gmail.com>
|
||||||
yuiseki <yuiseki@gmail.com>
|
yuiseki <yuiseki@gmail.com>
|
||||||
zakkor <edward.partenie@gmail.com>
|
zakkor <edward.partenie@gmail.com>
|
||||||
|
zhangkaihuo <zhangkaihuo@gmail.com>
|
||||||
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
||||||
|
zhouwg <zhouwg2000@gmail.com>
|
||||||
zrm <trustiosity.zrm@gmail.com>
|
zrm <trustiosity.zrm@gmail.com>
|
||||||
|
Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
|
||||||
源文雨 <41315874+fumiama@users.noreply.github.com>
|
源文雨 <41315874+fumiama@users.noreply.github.com>
|
||||||
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
|
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
|
||||||
|
|
CMakeLists.txt (1306) — file diff suppressed because it is too large.
CMakePresets.json

@@ -1,4 +1,4 @@
 {
   "version": 4,
   "configurePresets": [
     {
@@ -11,10 +11,22 @@
         "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
       }
     },
+    {
+      "name": "sycl-base",
+      "hidden": true,
+      "generator": "Ninja",
+      "binaryDir": "${sourceDir}/build-${presetName}",
+      "cacheVariables": {
+        "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+        "CMAKE_CXX_COMPILER": "icx",
+        "GGML_SYCL": "ON",
+        "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+      }
+    },
     { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
-    { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
+    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+    { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },

     {
       "name": "arm64-windows-msvc", "hidden": true,
@@ -35,11 +47,18 @@
     },

     { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },

     { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
+
+    { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
+    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
+    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
+
+    { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
+    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
   ]
 }
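As a rough sketch of how the presets added above can be consumed from the command line (assuming a CMake version new enough to support presets; the build directory name follows the `binaryDir` pattern declared in the file):

```sh
# List the configure presets defined in CMakePresets.json,
# then configure and build the new SYCL release preset
cmake --list-presets
cmake --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j
```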
CONTRIBUTING.md (new file, 14 lines)
@@ -0,0 +1,14 @@
+# Contributing Guidelines
+
+## Checklist
+
+* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
+* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
+* Execute [the full CI locally on your machine](ci/README.md) before publishing
+
+## PR formatting
+
+* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
+  - The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your convenience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
+* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
+* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`
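For illustration, commit titles following the two rules above might look like this (the module names and PR number are hypothetical examples, not taken from the repository):

```sh
# Hypothetical commit titles following the format above
git commit -m "utils : Fix typo in utils.py (#1234)"
git commit -m "readme : update build instructions [no ci]"
```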
Package.swift

@@ -3,14 +3,13 @@
 import PackageDescription

 var sources = [
-    "ggml.c",
-    "sgemm.cpp",
-    "llama.cpp",
-    "unicode.cpp",
-    "unicode-data.cpp",
-    "ggml-alloc.c",
-    "ggml-backend.c",
-    "ggml-quants.c",
+    "src/llama.cpp",
+    "src/unicode.cpp",
+    "src/unicode-data.cpp",
+    "ggml/src/ggml.c",
+    "ggml/src/ggml-alloc.c",
+    "ggml/src/ggml-backend.c",
+    "ggml/src/ggml-quants.c",
 ]

 var resources: [Resource] = []
@@ -26,8 +25,8 @@ var cSettings: [CSetting] = [
 ]

 #if canImport(Darwin)
-sources.append("ggml-metal.m")
-resources.append(.process("ggml-metal.metal"))
+sources.append("ggml/src/ggml-metal.m")
+resources.append(.process("ggml/src/ggml-metal.metal"))
 linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
     contentsOf: [
@@ -63,8 +62,6 @@ let package = Package(
             "models",
             "tests",
             "CMakeLists.txt",
-            "ggml-cuda.cu",
-            "ggml-cuda.h",
             "Makefile"
         ],
         sources: sources,
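A minimal sanity check of the updated manifest, assuming a macOS host with a recent Swift toolchain (a sketch, not part of the diff itself):

```sh
# Resolve and build the Swift package with the restructured source paths
swift build -c release
```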
@@ -1,6 +1,7 @@
 # llama.cpp for SYCL

 - [Background](#background)
+- [Recommended Release](#recommended-release)
 - [News](#news)
 - [OS](#os)
 - [Hardware](#hardware)
@@ -29,10 +30,25 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o

 When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.

-It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
+It has a similar design to other llama.cpp BLAS-based backends such as *OpenBLAS, cuBLAS, etc.*. In the initial work, oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for the conversion.
+
+## Recommended Release
+
+The SYCL backend may occasionally be broken by new PRs, since there is currently no online CI for it.
+
+The following release has been verified to work well:
+
+|Commit ID|Tag|Release|Verified Platform|
+|-|-|-|-|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
+
 ## News

+- 2024.5
+  - Performance increased from 34 to 37 tokens/s for llama-2-7b.Q4_0 on Arc 770.
+  - Arch Linux has been verified successfully.
+
 - 2024.4
   - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.
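A quick sketch for pinning a checkout to the verified release listed above (the tag and commit ID are taken from the table; the remote name `origin` is an assumption):

```sh
# Check out the release that was verified for the SYCL backend
git fetch --tags origin
git checkout b3038   # or: git checkout fb76ec31a9914b7761c1727303ab30380fd4f05c
```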
@@ -55,8 +71,8 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 ## OS

 | OS      | Status  | Verified                                       |
-|---------|---------|------------------------------------|
-| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39 |
+|---------|---------|------------------------------------------------|
+| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
 | Windows | Support | Windows 11                                     |
@@ -70,14 +86,14 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 |-------------------------------|---------|---------------------------------------|
 | Intel Data Center Max Series  | Support | Max 1550, 1100                        |
 | Intel Data Center Flex Series | Support | Flex 170                              |
-| Intel Arc Series              | Support | Arc 770, 730M                         |
+| Intel Arc Series              | Support | Arc 770, 730M, Arc A750               |
 | Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
 | Intel iGPU                    | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |

 *Notes:*

 - **Memory**
-  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
+  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.

   - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
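A short way to surface the `llm_load_tensors: buffer_size` line mentioned above is to filter the log of a brief run (a sketch; the model path and prompt are placeholders):

```sh
# Print only the model buffer size reported during load
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "hi" -n 1 2>&1 | grep "llm_load_tensors"
```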
@@ -99,14 +115,14 @@ The docker build option is currently limited to *intel GPU* targets.
 ### Build image
 ```sh
 # Using FP16
-docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
+docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
 ```

 *Notes*:

-To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
+To build in default FP32 *(slower than the FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.

-You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
+You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.

 ### Run container

@@ -228,10 +244,10 @@ source /opt/intel/oneapi/setvars.sh
 # Build LLAMA with MKL BLAS acceleration for intel GPU

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

 # Option 2: Use FP16
-cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON

 # build all binary
 cmake --build build --config Release -j -v
@@ -248,10 +264,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 # Build LLAMA with Nvidia BLAS acceleration through SYCL

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

 # Option 2: Use FP16
-cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON

 # build all binary
 cmake --build build --config Release -j -v
@@ -275,7 +291,7 @@ source /opt/intel/oneapi/setvars.sh
 Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:

 ```sh
-./build/bin/ls-sycl-device
+./build/bin/llama-ls-sycl-device
 ```
 A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
 ```
@@ -313,7 +329,7 @@ Examples:
 - Use device 0:

 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
 ```
 or run by script:

@@ -324,7 +340,7 @@ or run by script:
 - Use multiple devices:

 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
 ```

 Otherwise, you can run the script:
@@ -394,15 +410,9 @@ Output (example):

 4. Install build tools

-a. Download & install cmake for Windows: https://cmake.org/download/
+a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from the Visual Studio Installer)
+b. Newer Visual Studio versions install Ninja by default. (If not, please install it manually: https://ninja-build.org/)

-b. Download & install mingw-w64 make for Windows provided by w64devkit
-
-- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
-
-- Extract `w64devkit` on your pc.
-
-- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).
-
 ### II. Build llama.cpp

@@ -412,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
+cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release

 # Option 2: Or FP16
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON

 cmake --build build --config Release -j
 ```
@@ -425,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
 .\examples\sycl\win-build-sycl.bat
 ```

+Or, use CMake presets to build:
+```sh
+cmake --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake --preset x64-windows-sycl-debug
+cmake --build build-x64-windows-sycl-debug -j --target llama-cli
+```
+
+Or, you can use Visual Studio to open the llama.cpp folder as a CMake project. Choose the SYCL CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
+
 *Notes:*

-- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
+- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.

 ### III. Run the inference

@@ -488,13 +512,13 @@ Examples:
 - Use device 0:

 ```
-build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
 ```

 - Use multiple devices:

 ```
-build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
+build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
 ```
 Otherwise, run the following wrapper script:

@@ -520,9 +544,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 | Name               | Value                             | Function                                    |
 |--------------------|-----------------------------------|---------------------------------------------|
-| LLAMA_SYCL         | ON (mandatory)                    | Enable build with SYCL code path.           |
-| LLAMA_SYCL_TARGET  | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
-| LLAMA_SYCL_F16     | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
+| GGML_SYCL          | ON (mandatory)                    | Enable build with SYCL code path.           |
+| GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
+| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
 | CMAKE_C_COMPILER   | icx                               | Set *icx* compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |
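For example, the variables in the table above can be combined into a single configure step (an illustrative sketch; the FP16 + NVIDIA combination is just one possible choice):

```sh
# FP16 SYCL build targeting an NVIDIA device, using the options from the table
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_F16=ON \
      -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build build --config Release -j
```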
README.md (364)
@@ -2,14 +2,20 @@

 

-[](https://opensource.org/licenses/MIT) [](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[](https://opensource.org/licenses/MIT)
+[](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[](https://conan.io/center/llama-cpp)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

+> [!IMPORTANT]
+> [2024 Jun 12] Binaries have been renamed with a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
+
 ### Recent API changes

+- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
 - [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
 - [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
@@ -20,7 +26,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

### Hot topics

-- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
+- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
+- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
@@ -50,7 +57,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

<li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
-<li><a href="#instruct-mode">Instruct mode</a></li>
<li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
@@ -74,7 +80,7 @@ variety of hardware - locally and in the cloud.

- AVX, AVX2 and AVX512 support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
-- Vulkan, SYCL, and (partial) OpenCL backend support
+- Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
@@ -107,7 +113,6 @@ Typically finetunes of the base models below are supported as well.

- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
-- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
@@ -128,6 +133,7 @@ Typically finetunes of the base models below are supported as well.

- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
+- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)

(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
@@ -141,11 +147,14 @@ Typically finetunes of the base models below are supported as well.

- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
+- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)

**HTTP server**

[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.

+[simplechat](./examples/server/public_simplechat) is a simple chat client, which can be used to chat with the model exposed using above web server (use --path to point to simplechat), from a local web browser.

**Bindings:**

- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
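As a rough illustration of what "OpenAI API compatible" means in practice, the sketch below starts the server and queries it with a chat-completion request; the endpoint path, default port and JSON shape are assumptions based on the server's OpenAI-style interface rather than something stated on this page, and the model path is a placeholder:

```bash
# start the HTTP server (sketch; model path is illustrative)
./llama-server -m ./models/mymodel/ggml-model-Q4_K_M.gguf --port 8080

# query it with an OpenAI-style chat completion request
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "Say hello in one sentence."}], "max_tokens": 32}'
```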
@@ -187,6 +196,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:

- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
+- [RAGNA Desktop](https://ragna.app/) (proprietary)
- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
@@ -199,15 +209,21 @@ Unless otherwise noted these projects are open-source with permissive licensing:

- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [AIKit](https://github.com/sozercan/aikit) (MIT)
+- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)

*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

+**Tools:**
+
+- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML

---

Here is a typical run using LLaMA v2 13B on M2 Ultra:

```
-$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
+$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm
@@ -301,7 +317,7 @@ cd llama.cpp

### Build

-In order to build llama.cpp you have three different options.
+In order to build llama.cpp you have four different options.

- Using `make`:
- On Linux or MacOS:
@@ -310,8 +326,6 @@ In order to build llama.cpp you have three different options.

make
```

-**Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`

- On Windows:

1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@@ -323,6 +337,11 @@ In order to build llama.cpp you have three different options.

make
```

+- Notes:
+- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
+- For faster repeated compilation, install [ccache](https://ccache.dev/).
+- For debug builds, run `make LLAMA_DEBUG=1`

- Using `CMake`:

```bash
@@ -330,33 +349,26 @@ In order to build llama.cpp you have three different options.

cmake --build build --config Release
```

-**Note**: for `Debug` builds, there are two cases:
+**Notes**:

-- Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
+- For faster repeated compilation, install [ccache](https://ccache.dev/).
+- For debug builds, there are two cases:
+
+1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):

```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```

-- Multi-config generators (`-G` param set to Visual Studio, XCode...):
+2. Multi-config generators (`-G` param set to Visual Studio, XCode...):

```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```

-- Using `Zig` (version 0.11 or later):
-
-Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
-it's also possible to cross compile for other operating systems and architectures:
-
-```bash
-zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
-```
-
-The `zig targets` command will give you valid options to use.

- Using `gmake` (FreeBSD):

1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@@ -364,66 +376,54 @@ In order to build llama.cpp you have three different options.

3. Install compilation dependencies.

```bash
-sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
-opencl clblast openblas
+sudo pkg install gmake automake autoconf pkgconf llvm15 openblas

gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
-**Notes:** With this packages you can build llama.cpp with OPENBLAS and
-CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
-the instructions for use and activate this options in this document below.
+### Homebrew
+
+On Mac and Linux, the homebrew package manager can be used via
+```
+brew install llama.cpp
+```
+The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
+
+### Nix
+
+On Mac and Linux, the Nix package manager can be used via
+```
+nix profile install nixpkgs#llama-cpp
+```
+For flake enabled installs.
+
+Or
+```
+nix-env --file '<nixpkgs>' --install --attr llama-cpp
+```
+For non-flake enabled installs.
+
+This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
+
+#### Flox
+
+On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
+```
+flox install llama-cpp
+```
+Flox follows the nixpkgs build of llama.cpp.
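After installing through any of these package managers, the renamed binaries should be available on `PATH`; a quick smoke test might look like the sketch below (the `--version` flag and the model path are assumptions, not taken from this page):

```bash
llama-cli --version                                                    # print build information
llama-server -m ./models/mymodel/ggml-model-Q4_K_M.gguf --port 8080    # serve a local model
```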
### Metal Build

On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
-To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
+To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.

When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument.
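For illustration, a CPU-only build on macOS would look roughly like this sketch, using the flags named above:

```bash
# CMake: turn the Metal backend off at configure time
cmake -B build -DGGML_METAL=OFF
cmake --build build --config Release

# make: the equivalent switch
make GGML_NO_METAL=1
```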
-### MPI Build

-MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.

-First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).

-Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):

-- Using `make`:

-```bash
-make CC=mpicc CXX=mpicxx LLAMA_MPI=1
-```

-- Using `CMake`:

-```bash
-cmake -S . -B build -DLLAMA_MPI=ON
-```

-Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.

-Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".

-Here is an example hostfile:

-```
-192.168.0.1:2
-malvolio.local:1
-```

-The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.

-Finally, you're ready to run a computation using `mpirun`:

-```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
-```
### BLAS Build

-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:

- #### Accelerate Framework:
@@ -436,7 +436,7 @@ Building the program with BLAS support may lead to some performance improvements

- Using `make`:
- On Linux:
```bash
-make LLAMA_OPENBLAS=1
+make GGML_OPENBLAS=1
```

- On Windows:
@@ -451,13 +451,13 @@ Building the program with BLAS support may lead to some performance improvements

8. From here you can run:

```bash
-make LLAMA_OPENBLAS=1
+make GGML_OPENBLAS=1
```

- Using `CMake` on Linux:

```bash
-cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release
```
@@ -476,10 +476,10 @@ Building the program with BLAS support may lead to some performance improvements

Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).

- Using manual oneAPI installation:
-By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
+By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
```bash
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
-cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
+cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
cmake --build build --config Release
```
@@ -496,25 +496,28 @@ Building the program with BLAS support may lead to some performance improvements

- Using `make`:
```bash
-make LLAMA_CUDA=1
+make GGML_CUDA=1
```
- Using `CMake`:

```bash
-cmake -B build -DLLAMA_CUDA=ON
+cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```

The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:

| Option | Legal values | Default | Description |
|--------|--------------|---------|-------------|
-| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
-| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
-| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
-| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
+| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
+| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
+| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
+| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
+| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
+| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
+| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
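Putting a few of these together, a tuned build and single-GPU run might look like the sketch below; the chosen values are illustrative rather than recommendations, and the model path is a placeholder:

```bash
# build with CUDA, FP16 kernels and a larger peer-access batch threshold
make -j GGML_CUDA=1 GGML_CUDA_F16=1 GGML_CUDA_PEER_MAX_BATCH_SIZE=256

# run on the first GPU only, offloading 33 layers
CUDA_VISIBLE_DEVICES=0 ./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -p "Hello" -n 64 -ngl 33
```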
- #### hipBLAS

@@ -524,15 +527,15 @@ Building the program with BLAS support may lead to some performance improvements

- Using `make`:
```bash
-make LLAMA_HIPBLAS=1
+make GGML_HIPBLAS=1
```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16
```
-On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
+On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).

Note that if you get the following error:
@@ -546,19 +549,19 @@ Building the program with BLAS support may lead to some performance improvements

```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16
```

- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
-make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
+make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
```

- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
-cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake --build build
```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
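If you are unsure which target your card reports, one way to check is ROCm's `rocminfo` tool (this assumes ROCm is installed; it is not part of the original instructions):

```bash
rocminfo | grep -m1 gfx    # e.g. prints "Name: gfx1030"
```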
@@ -570,115 +573,10 @@ Building the program with BLAS support may lead to some performance improvements

The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):

| Option | Legal values | Default | Description |
|--------|--------------|---------|-------------|
-| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
-| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
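Combining the HIP build switch with one of the tuning options from this table could look like the following sketch (the values are illustrative):

```bash
make -j16 GGML_HIPBLAS=1 AMDGPU_TARGETS=gfx1030 GGML_CUDA_KQUANTS_ITER=1
```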
-- #### CLBlast

-OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.

-You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
-- For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.

-- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.

-- <details>
-<summary>Installing the OpenCL SDK from source</summary>

-```sh
-git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
-cd OpenCL-SDK
-cmake -B build -DBUILD_DOCS=OFF \
-  -DBUILD_EXAMPLES=OFF \
-  -DBUILD_TESTING=OFF \
-  -DOPENCL_SDK_BUILD_SAMPLES=OFF \
-  -DOPENCL_SDK_TEST_SAMPLES=OFF
-cmake --build build
-cmake --install build --prefix /some/path
-```
-</details>

-##### Installing CLBlast

-Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.

-Linux packaging:
-Fedora Linux:
-```bash
-sudo dnf install clblast
-```

-Alternatively, they may be built from source.

-- <details>
-<summary>Windows:</summary>

-```cmd
-set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
-git clone https://github.com/CNugteren/CLBlast.git
-cd CLBlast
-cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
-cmake --build build --config Release
-cmake --install build --prefix C:/CLBlast
-```

-(note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)

-- <details>
-<summary>Unix:</summary>

-```sh
-git clone https://github.com/CNugteren/CLBlast.git
-cd CLBlast
-cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
-cmake --build build --config Release
-cmake --install build --prefix /some/path
-```

-Where `/some/path` is where the built library will be installed (default is `/usr/local`).
-</details>

-##### Building Llama with CLBlast

-- Build with make:
-```sh
-make LLAMA_CLBLAST=1
-```
-- CMake (Unix):
-```sh
-cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
-cmake --build build --config Release
-```
-- CMake (Windows):
-```cmd
-set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
-git clone https://github.com/ggerganov/llama.cpp
-cd llama.cpp
-cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
-cmake --build build --config Release
-cmake --install build --prefix C:/LlamaCPP
-```

-##### Running Llama with CLBlast

-The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.

-To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
-The selection can be a number (starting from 0) or a text string to search:

-```sh
-GGML_OPENCL_PLATFORM=1 ./main ...
-GGML_OPENCL_DEVICE=2 ./main ...
-GGML_OPENCL_PLATFORM=Intel ./main ...
-GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
-```

-The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
-Using the variables it is possible to select a CPU-based driver as well, if so desired.

-You can get a list of platforms and devices from the `clinfo -l` command, etc.
- #### Vulkan

@@ -688,7 +586,7 @@ Building the program with BLAS support may lead to some performance improvements

```sh
# Build the image
-docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
+docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .

# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
@@ -709,15 +607,17 @@ Building the program with BLAS support may lead to some performance improvements

vulkaninfo
```

-Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
+Alternatively your package manager might be able to provide the appropriate libraries.
+For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
+For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.

Then, build llama.cpp using the cmake command below:

```bash
-cmake -B build -DLLAMA_VULKAN=1
+cmake -B build -DGGML_VULKAN=1
cmake --build build --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
-./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
+./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4

# You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
@@ -730,7 +630,8 @@ Building the program with BLAS support may lead to some performance improvements

To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

-Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
+It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.

```bash
# obtain the official LLaMA model weights and place them in ./models
@@ -747,23 +648,20 @@ ls ./models

python3 -m pip install -r requirements.txt

# convert the model to ggml FP16 format
-python3 convert.py models/mymodel/
+python3 convert-hf-to-gguf.py models/mymodel/

-# [Optional] for models using BPE tokenizers
-python convert.py models/mymodel/ --vocab-type bpe

# quantize the model to 4-bits (using Q4_K_M method)
-./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
+./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M

# update the gguf filetype to current version if older version is now unsupported
-./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
+./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
```

### Run the quantized model

```bash
# start inference on a gguf model
-./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
+./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
```

When running the larger models, make sure you have enough disk space to store all the intermediate files.
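As a small variation on the workflow above, `convert-hf-to-gguf.py` can also emit a quantized GGUF directly through its `--outtype` option, skipping the separate FP16 intermediate; the exact set of supported output types depends on your checkout, so treat this as a sketch:

```bash
# convert straight to an 8-bit GGUF instead of producing an FP16 file first
python3 convert-hf-to-gguf.py models/mymodel/ --outtype q8_0
```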
@@ -838,7 +736,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread

#### How to run

1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
+2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
perplexity : calculating perplexity over 655 chunks
@@ -862,16 +760,16 @@ Here is an example of a few-shot interaction, invoked with the command

./examples/chat-13B.sh

# custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```

-Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
+Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.



### Persistent Interaction

-The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
+The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.

```bash
# Start a new chat
@@ -893,41 +791,13 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \

`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:

```bash
-./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
```

The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).

For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
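A grammar can also be passed inline with `--grammar` instead of a file; the tiny grammar below is an illustrative sketch rather than one of the bundled samples:

```bash
# constrain the answer to a bare "yes" or "no"
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 8 \
    --grammar 'root ::= ("yes" | "no")' -p 'Is the sky blue? Answer:'
```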
-### Instruct mode

-1. First, download and place the `ggml` model into the `./models` folder
-2. Run the `main` tool like this:

-```
-./examples/alpaca.sh
-```

-Sample run:

-```
-== Running in interactive mode. ==
-- Press Ctrl+C to interject at any time.
-- Press Return to return control to LLaMA.
-- If you want to submit another line, end your input in '\'.

-Below is an instruction that describes a task. Write a response that appropriately completes the request.

-> How many letters are there in the English alphabet?
-There 26 letters in the English Alphabet
-> What is the most common way of transportation in Amsterdam?
-The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
-> List 5 words that start with "ca".
-cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
->
-```

### Obtaining and using the Facebook LLaMA 2 model

- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
@@ -1000,7 +870,7 @@ $mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/ho

Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
-$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
+$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
```

Here's a demo of an interactive session running on Pixel 5 phone:
@@ -1067,8 +937,8 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia

```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
-docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
+docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
-docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
+docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
```

You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
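After the images are built, a typical run of the light image looks roughly like the sketch below, mirroring the run commands used elsewhere in this README; the paths and layer count are placeholders:

```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda \
    -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 99
```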
@@ -1118,7 +988,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m

### Docs

-- [main](./examples/main/README.md)
+- [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md)
172 build.zig
@@ -1,172 +0,0 @@
// Compatible with Zig Version 0.11.0
const std = @import("std");
const ArrayList = std.ArrayList;
const Compile = std.Build.Step.Compile;
const ConfigHeader = std.Build.Step.ConfigHeader;
const Mode = std.builtin.Mode;
const CrossTarget = std.zig.CrossTarget;

const Maker = struct {
    builder: *std.build.Builder,
    target: CrossTarget,
    optimize: Mode,
    enable_lto: bool,

    include_dirs: ArrayList([]const u8),
    cflags: ArrayList([]const u8),
    cxxflags: ArrayList([]const u8),
    objs: ArrayList(*Compile),

    fn addInclude(m: *Maker, dir: []const u8) !void {
        try m.include_dirs.append(dir);
    }
    fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
    }
    fn addCFlag(m: *Maker, flag: []const u8) !void {
        try m.cflags.append(flag);
    }
    fn addCxxFlag(m: *Maker, flag: []const u8) !void {
        try m.cxxflags.append(flag);
    }
    fn addFlag(m: *Maker, flag: []const u8) !void {
        try m.addCFlag(flag);
        try m.addCxxFlag(flag);
    }

    fn init(builder: *std.build.Builder) !Maker {
        const target = builder.standardTargetOptions(.{});
        const zig_version = @import("builtin").zig_version_string;
        const commit_hash = try std.ChildProcess.exec(
            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
        );
        try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
            \\int LLAMA_BUILD_NUMBER = {};
            \\char const *LLAMA_COMMIT = "{s}";
            \\char const *LLAMA_COMPILER = "Zig {s}";
            \\char const *LLAMA_BUILD_TARGET = "{s}";
            \\
        , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
        var m = Maker{
            .builder = builder,
            .target = target,
            .optimize = builder.standardOptimizeOption(.{}),
            .enable_lto = false,
            .include_dirs = ArrayList([]const u8).init(builder.allocator),
            .cflags = ArrayList([]const u8).init(builder.allocator),
            .cxxflags = ArrayList([]const u8).init(builder.allocator),
            .objs = ArrayList(*Compile).init(builder.allocator),
        };

        try m.addCFlag("-std=c11");
        try m.addCxxFlag("-std=c++11");
        try m.addProjectInclude(&.{});
        try m.addProjectInclude(&.{"common"});
        return m;
    }

    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
        if (o.target.getAbi() != .msvc)
            o.defineCMacro("_GNU_SOURCE", null);

        if (std.mem.endsWith(u8, src, ".c")) {
            o.addCSourceFiles(&.{src}, m.cflags.items);
            o.linkLibC();
        } else {
            o.addCSourceFiles(&.{src}, m.cxxflags.items);
            if (o.target.getAbi() == .msvc) {
                o.linkLibC(); // need winsdk + crt
            } else {
                // linkLibCpp already add (libc++ + libunwind + libc)
                o.linkLibCpp();
            }
        }
        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
        o.want_lto = m.enable_lto;
        return o;
    }

    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
        e.addCSourceFiles(&.{src}, m.cxxflags.items);
        for (deps) |d| e.addObject(d);
        for (m.objs.items) |o| e.addObject(o);
        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });

        // https://github.com/ziglang/zig/issues/15448
        if (e.target.getAbi() == .msvc) {
            e.linkLibC(); // need winsdk + crt
        } else {
            // linkLibCpp already add (libc++ + libunwind + libc)
            e.linkLibCpp();
        }
        m.builder.installArtifact(e);
        e.want_lto = m.enable_lto;
        return e;
    }
};

pub fn build(b: *std.build.Builder) !void {
    var make = try Maker.init(b);
    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;

    const ggml = make.obj("ggml", "ggml.c");
    const sgemm = make.obj("sgemm", "sgemm.cpp");
    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
|
|
||||||
const unicode = make.obj("unicode", "unicode.cpp");
|
|
||||||
const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
|
|
||||||
const llama = make.obj("llama", "llama.cpp");
|
|
||||||
const buildinfo = make.obj("common", "common/build-info.cpp");
|
|
||||||
const common = make.obj("common", "common/common.cpp");
|
|
||||||
const console = make.obj("console", "common/console.cpp");
|
|
||||||
const sampling = make.obj("sampling", "common/sampling.cpp");
|
|
||||||
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
|
|
||||||
const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
|
|
||||||
const train = make.obj("train", "common/train.cpp");
|
|
||||||
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
|
||||||
const llava = make.obj("llava", "examples/llava/llava.cpp");
|
|
||||||
|
|
||||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
|
|
||||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
|
|
||||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
|
|
||||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
|
|
||||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
|
|
||||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
|
|
||||||
|
|
||||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
|
|
||||||
if (server.target.isWindows()) {
|
|
||||||
server.linkSystemLibrary("ws2_32");
|
|
||||||
}
|
|
||||||
|
|
||||||
const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
|
|
||||||
for (server_assets) |asset| {
|
|
||||||
const input_path = b.fmt("examples/server/public/{s}", .{asset});
|
|
||||||
const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
|
|
||||||
|
|
||||||
// Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
|
|
||||||
|
|
||||||
const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
|
|
||||||
defer b.allocator.free(input);
|
|
||||||
|
|
||||||
var buf = std.ArrayList(u8).init(b.allocator);
|
|
||||||
defer buf.deinit();
|
|
||||||
|
|
||||||
for (input) |byte| {
|
|
||||||
try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
|
|
||||||
}
|
|
||||||
|
|
||||||
var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
|
|
||||||
defer b.allocator.free(name);
|
|
||||||
std.mem.replaceScalar(u8, name, '.', '_');
|
|
||||||
|
|
||||||
try std.fs.cwd().writeFile(output_path, b.fmt(
|
|
||||||
"unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
|
|
||||||
.{ name, buf.items, name, input.len },
|
|
||||||
));
|
|
||||||
|
|
||||||
std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
|
|
||||||
}
|
|
||||||
}
|
|
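With build.zig removed, the Zig build path is gone and the CMake/make flow is what remains. A minimal sketch of that flow, assuming a CUDA toolchain is installed; the option name follows the `GGML_CUDA` flag used elsewhere in this diff and the model path is a placeholder:

```bash
# Hedged sketch of building without build.zig.
cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=1
cmake --build build -j "$(nproc)"
./build/bin/llama-cli -m ./models/your-model.gguf -p "Hello" -n 32
```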
511  ci/run.sh
@@ -36,11 +36,11 @@ SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"

if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
fi

if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
fi

if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -50,7 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
        exit 1
    fi

-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi

## helpers

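How the renamed toggles are consumed, sketched below; the output and mount directories are placeholders, not paths mandated by the script:

```bash
# Hedged sketch: enable the CUDA build path of the CI script via the GG_BUILD_CUDA toggle.
mkdir -p tmp/results tmp/mnt
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```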
@@ -202,12 +202,15 @@ function gg_sum_test_scripts_release {
}

function gg_get_model {
-    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
-    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
-    if [[ -s $gguf_3b ]]; then
-        echo -n "$gguf_3b"
-    elif [[ -s $gguf_7b ]]; then
-        echo -n "$gguf_7b"
+    local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
+    local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
+    local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
+    if [[ -s $gguf_0 ]]; then
+        echo -n "$gguf_0"
+    elif [[ -s $gguf_1 ]]; then
+        echo -n "$gguf_1"
+    elif [[ -s $gguf_2 ]]; then
+        echo -n "$gguf_2"
    else
        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
        exit 1
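A hypothetical caller of the updated fallback order, for illustration only (the variable name below is made up):

```bash
# Made-up consumer: whichever converted model is present wins, Pythia 1.4B first.
model_path=$(gg_get_model)
echo "ctest-with-model will use: ${model_path}"
```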
@@ -256,139 +259,6 @@ function gg_sum_ctest_with_model_release {
    gg_printf '```\n'
}

# open_llama_3b_v2
|
|
||||||
|
|
||||||
function gg_run_open_llama_3b_v2 {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
|
|
||||||
|
|
||||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
|
||||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
|
||||||
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
|
||||||
|
|
||||||
path_models="../models-mnt/open-llama/3B-v2"
|
|
||||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
|
||||||
|
|
||||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
|
||||||
|
|
||||||
python3 ../convert.py ${path_models}
|
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
|
||||||
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
|
||||||
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
|
||||||
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
|
||||||
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
|
||||||
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
|
||||||
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
|
||||||
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
|
||||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
|
||||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
|
||||||
|
|
||||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
|
||||||
|
|
||||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
|
||||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
|
||||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
|
||||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
|
||||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
|
||||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
|
||||||
|
|
||||||
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
|
||||||
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
|
||||||
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
|
||||||
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
|
||||||
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
|
||||||
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
|
||||||
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
|
||||||
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
|
||||||
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
|
||||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
|
||||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
|
||||||
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
|
||||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
|
||||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
|
||||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
|
||||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
|
||||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
|
||||||
|
|
||||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
|
||||||
|
|
||||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
|
|
||||||
function check_ppl {
|
|
||||||
qnt="$1"
|
|
||||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
|
||||||
|
|
||||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
|
||||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
|
||||||
return 20
|
|
||||||
fi
|
|
||||||
|
|
||||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
|
|
||||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_open_llama_3b_v2 {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'OpenLLaMA 3B-v2:\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
|
||||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
|
||||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
|
||||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
|
||||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
|
||||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
|
||||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
|
||||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
|
||||||
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
|
||||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
|
||||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
|
||||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
|
||||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
|
||||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
|
||||||
}
|
|
||||||
|
|
||||||
# open_llama_7b_v2
# requires: GG_BUILD_CUDA

@@ -414,10 +284,10 @@ function gg_run_open_llama_7b_v2 {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert.py ${path_models}
+    python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"

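The same conversion step can be run outside CI; a hedged sketch, where the checkpoint directory is a placeholder and the script path assumes the repository root as working directory:

```bash
# Sketch of the updated legacy-LLaMA conversion plus one quantization step.
python3 examples/convert-legacy-llama.py ./models/open-llama-7b-v2 \
    --outfile ./models/open-llama-7b-v2/ggml-model-f16.gguf
./build/bin/llama-quantize ./models/open-llama-7b-v2/ggml-model-f16.gguf \
    ./models/open-llama-7b-v2/ggml-model-q4_0.gguf q4_0
```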
@@ -433,47 +303,47 @@ function gg_run_open_llama_7b_v2 {

    wiki_test="${path_wiki}/wiki.test.raw"

-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-    (time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
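For reference, the perplexity check inside `check_ppl` boils down to grabbing the last float on the first chunk's log line and comparing it against the 20.0 threshold; the sample log line below is made up:

```bash
# Made-up log line in the format check_ppl receives ("[1]<ppl>,...").
line='[1]5.9207,'
ppl=$(echo "$line" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
[ "$(echo "$ppl > 20.0" | bc)" -eq 1 ] && echo "FAIL: $ppl" || echo "OK: $ppl"
```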
@@ -526,6 +396,272 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}

|
# pythia_1.4b
|
||||||
|
|
||||||
|
function gg_run_pythia_1_4b {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
|
||||||
|
|
||||||
|
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
|
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||||
|
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
||||||
|
|
||||||
|
path_models="../models-mnt/pythia/1.4B"
|
||||||
|
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||||
|
|
||||||
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
|
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||||
|
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||||
|
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||||
|
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||||
|
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||||
|
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||||
|
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||||
|
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||||
|
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||||
|
|
||||||
|
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||||
|
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
|
(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
|
(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
|
function check_ppl {
|
||||||
|
qnt="$1"
|
||||||
|
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
|
||||||
|
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||||
|
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||||
|
return 20
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
||||||
|
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
|
||||||
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
|
set +e
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_pythia_1_4b {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'Pythia 1.4B:\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||||
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||||
|
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||||
|
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||||
|
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||||
|
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||||
|
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||||
|
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||||
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# pythia_2_8b
|
||||||
|
# requires: GG_BUILD_CUDA
|
||||||
|
|
||||||
|
function gg_run_pythia_2_8b {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
|
||||||
|
|
||||||
|
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
|
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||||
|
|
||||||
|
path_models="../models-mnt/pythia/2.8B"
|
||||||
|
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||||
|
|
||||||
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
|
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||||
|
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||||
|
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||||
|
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||||
|
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||||
|
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||||
|
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||||
|
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||||
|
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||||
|
|
||||||
|
wiki_test="${path_wiki}/wiki.test.raw"
|
||||||
|
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
|
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
|
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
|
function check_ppl {
|
||||||
|
qnt="$1"
|
||||||
|
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
|
||||||
|
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||||
|
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||||
|
return 20
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
||||||
|
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
|
||||||
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
|
set +e
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_pythia_2_8b {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'Pythia 2.8B:\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||||
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||||
|
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||||
|
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||||
|
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||||
|
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||||
|
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||||
|
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||||
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
|
}
|
||||||
|
|
||||||
# bge-small

function gg_run_embd_bge_small {
@@ -552,15 +688,15 @@ function gg_run_embd_bge_small {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert-hf-to-gguf.py ${path_models}
+    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"

-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

-    (time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

    set +e
}
@@ -606,9 +742,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then

    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
        if [ -z ${GG_BUILD_CUDA} ]; then
-            test $ret -eq 0 && gg_run open_llama_3b_v2
+            test $ret -eq 0 && gg_run pythia_1_4b
        else
-            test $ret -eq 0 && gg_run open_llama_7b_v2
+            test $ret -eq 0 && gg_run pythia_2_8b
+            #test $ret -eq 0 && gg_run open_llama_7b_v2
        fi
    test $ret -eq 0 && gg_run ctest_with_model_debug
    test $ret -eq 0 && gg_run ctest_with_model_release

@@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

-set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
+set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )

set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

22  cmake/git-vars.cmake  Normal file
@@ -0,0 +1,22 @@
find_package(Git)

# the commit's SHA1
execute_process(COMMAND
    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_SHA1
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

# the date of the commit
execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_DATE
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

# the subject of the commit
execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%s
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
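For comparison, the same three values queried directly from a shell; the git flags mirror the `execute_process` calls above:

```bash
# Shell equivalents of the metadata gathered by cmake/git-vars.cmake.
git describe --match=NeVeRmAtCh --always --abbrev=8   # GIT_SHA1
git log -1 --format=%ad --date=local                  # GIT_DATE
git log -1 --format=%s                                # GIT_COMMIT_SUBJECT
```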
@@ -2,13 +2,12 @@ set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
-set(LLAMA_BLAS @LLAMA_BLAS@)
-set(LLAMA_CUDA @LLAMA_CUDA@)
-set(LLAMA_METAL @LLAMA_METAL@)
-set(LLAMA_MPI @LLAMA_MPI@)
-set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
-set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
-set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
+set(GGML_BLAS @GGML_BLAS@)
+set(GGML_CUDA @GGML_CUDA@)
+set(GGML_METAL @GGML_METAL@)
+set(GGML_HIPBLAS @GGML_HIPBLAS@)
+set(GGML_ACCELERATE @GGML_ACCELERATE@)

@PACKAGE_INIT@

@@ -19,33 +18,26 @@ set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
# Ensure transient dependencies satisfied

find_package(Threads REQUIRED)
-if (APPLE AND LLAMA_ACCELERATE)
+if (APPLE AND GGML_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
endif()

-if (LLAMA_BLAS)
+if (GGML_BLAS)
    find_package(BLAS REQUIRED)
endif()

-if (LLAMA_CUDA)
+if (GGML_CUDA)
    find_package(CUDAToolkit REQUIRED)
endif()

-if (LLAMA_METAL)
+if (GGML_METAL)
    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
    find_library(METAL_FRAMEWORK Metal REQUIRED)
    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()

-if (LLAMA_MPI)
+if (GGML_HIPBLAS)
-    find_package(MPI REQUIRED)
-endif()
-
-if (LLAMA_CLBLAST)
-    find_package(CLBlast REQUIRED)
-endif()
-
-if (LLAMA_HIPBLAS)
    find_package(hip REQUIRED)
    find_package(hipblas REQUIRED)
    find_package(rocblas REQUIRED)

@@ -57,7 +49,9 @@ find_library(llama_LIBRARY llama

set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")

add_library(llama UNKNOWN IMPORTED)

set_target_properties(llama
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
10
cmake/llama.pc.in
Normal file
10
cmake/llama.pc.in
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
prefix=@CMAKE_INSTALL_PREFIX@
|
||||||
|
exec_prefix=${prefix}
|
||||||
|
libdir=${exec_prefix}/lib
|
||||||
|
includedir=${prefix}/include
|
||||||
|
|
||||||
|
Name: llama
|
||||||
|
Description: Port of Facebook's LLaMA model in C/C++
|
||||||
|
Version: @PROJECT_VERSION@
|
||||||
|
Libs: -L${libdir} -lllama
|
||||||
|
Cflags: -I${includedir}
|
14
codecov.yml
14
codecov.yml
|
@ -1,14 +0,0 @@
|
||||||
comment: off
|
|
||||||
|
|
||||||
coverage:
|
|
||||||
status:
|
|
||||||
project:
|
|
||||||
default:
|
|
||||||
target: auto
|
|
||||||
threshold: 0
|
|
||||||
base: auto
|
|
||||||
patch:
|
|
||||||
default:
|
|
||||||
target: auto
|
|
||||||
threshold: 0
|
|
||||||
base: auto
|
|
|
@ -1,5 +1,6 @@
|
||||||
# common
|
# common
|
||||||
|
|
||||||
|
find_package(Threads REQUIRED)
|
||||||
|
|
||||||
# Build info header
|
# Build info header
|
||||||
#
|
#
|
||||||
|
@ -36,7 +37,7 @@ add_custom_command(
|
||||||
COMMENT "Generating build details from Git"
|
COMMENT "Generating build details from Git"
|
||||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
||||||
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
||||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
|
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
|
||||||
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
||||||
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
||||||
VERBATIM
|
VERBATIM
|
||||||
|
@ -84,4 +85,4 @@ endif ()
|
||||||
|
|
||||||
target_include_directories(${TARGET} PUBLIC .)
|
target_include_directories(${TARGET} PUBLIC .)
|
||||||
target_compile_features (${TARGET} PUBLIC cxx_std_11)
|
target_compile_features (${TARGET} PUBLIC cxx_std_11)
|
||||||
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
|
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||||
|
|
||||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
||||||
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
common/common.cpp (4136 lines changed; diff suppressed because it is too large)

common/common.h (215 lines changed)
@@ -41,22 +41,32 @@ extern char const *LLAMA_BUILD_TARGET;
 
 struct llama_control_vector_load_info;
 
-int get_math_cpu_count();
-int32_t get_num_physical_cores();
+//
+// CPU utils
+//
+
+int32_t cpu_get_num_physical_cores();
+int32_t cpu_get_num_math();
 
 //
 // CLI argument parsing
 //
 
+// dimensionality reduction methods, used by cvector-generator
+enum dimre_method {
+    DIMRE_METHOD_PCA,
+    DIMRE_METHOD_MEAN,
+};
+
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
-    int32_t n_threads             = get_math_cpu_count();
+    int32_t n_threads             = cpu_get_num_math();
     int32_t n_threads_draft       = -1;
     int32_t n_threads_batch       = -1;   // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict             = -1;   // new tokens to predict
-    int32_t n_ctx                 = 512;  // context size
+    int32_t n_ctx                 = 0;    // context size
     int32_t n_batch               = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch              = 512;  // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep                = 0;    // number of tokens to keep from initial prompt

@@ -67,10 +77,8 @@ struct gpt_params {
     float   p_split              = 0.1f; // speculative decoding split probability
     int32_t n_gpu_layers         = -1;   // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft   = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode  = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     int32_t main_gpu             = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[128]    = {0};  // how split tensors should be distributed across GPUs
-    int32_t n_beams              = 0;    // if non-zero then use beam search of given width.
     int32_t grp_attn_n           = 1;    // group-attention factor
     int32_t grp_attn_w           = 512;  // group-attention width
     int32_t n_print              = -1;   // print token count every n tokens (-1 = disabled)

@@ -82,13 +90,13 @@ struct gpt_params {
     float   yarn_beta_slow       = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx        = 0;     // YaRN original context length
     float   defrag_thold         = -1.0f; // KV cache defragmentation threshold
-    std::string rpc_servers      = "";    // comma separated list of RPC servers
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data                 = nullptr;
 
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
+    enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
 
@@ -106,12 +114,14 @@ struct gpt_params {
     std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
     std::string input_prefix      = "";  // string to prefix user inputs with
     std::string input_suffix      = "";  // string to suffix user inputs with
-    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
     std::string logdir            = "";  // directory in which to save YAML log files
     std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
     std::string logits_file       = "";  // file for saving *all* logits
+    std::string rpc_servers       = "";  // comma separated list of RPC servers
+
+    std::vector<std::string> in_files;   // all input files
+    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
     // TODO: avoid tuple, use struct

@@ -120,11 +130,12 @@ struct gpt_params {
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
+    int32_t verbosity                  = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end   = -1; // layer range for control vector
 
-    int  ppl_stride      = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
-    int  ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+    int32_t ppl_stride      = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                  //                                       (which is more convenient to use for plotting)
                                  //
     bool   hellaswag         = false; // compute HellaSwag score over random tasks from datafile supplied in prompt

@@ -138,18 +149,16 @@ struct gpt_params {
 
     bool kl_divergence    = false; // compute KL divergence
 
-    bool random_prompt    = false; // do not randomize prompt if none provided
+    bool usage            = false; // print usage
     bool use_color        = false; // use color to distinguish generations and inputs
+    bool special          = false; // enable special token output
     bool interactive      = false; // interactive mode
-    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
+    bool interactive_first = false; // wait for user input immediately
     bool conversation     = false; // conversation mode (does not print special tokens and suffix/prefix)
-    bool chatml           = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro  = false; // open the prompt cache read-only and do not update it
 
-    bool embedding        = false; // get only sentence embedding
-    bool escape           = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
-    bool interactive_first = false; // wait for user input immediately
+    bool escape           = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input  = false; // reverse the usage of `\`
     bool simple_io        = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching    = true;  // insert new sequences for decoding on-the-fly

@@ -157,7 +166,6 @@ struct gpt_params {
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos       = false; // ignore generated EOS tokens
-    bool instruct         = false; // instruction mode (used for Alpaca models)
     bool logits_all       = false; // return logits for all tokens in the batch
     bool use_mmap         = true;  // use mmap for faster loads
     bool use_mlock        = false; // use mlock to keep model in memory

@@ -175,37 +183,122 @@ struct gpt_params {
     // multimodal models (see examples/llava)
     std::string mmproj = "";        // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)
 
+    // embedding
+    bool embedding         = false; // get only sentence embedding
+    int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+    std::string embd_sep   = "\n";  // separator of embendings
+
+    // server params
+    int32_t port           = 8080;         // server listens on this network port
+    int32_t timeout_read   = 600;          // http read timeout in seconds
+    int32_t timeout_write  = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1;           // number of threads to process HTTP requests
+
+    std::string hostname      = "127.0.0.1";
+    std::string public_path   = "";
+    std::string chat_template = "";
+    std::string system_prompt = "";
+    bool enable_chat_template = true;
+
+    std::vector<std::string> api_keys;
+
+    std::string ssl_file_key  = "";
+    std::string ssl_file_cert = "";
+
+    bool endpoint_slots   = true;
+    bool endpoint_metrics = false;
+
+    bool log_json = false;
+
+    std::string slot_save_path;
+
+    float slot_prompt_similarity = 0.5f;
+
+    // batched-bench params
+    bool is_pp_shared = false;
+
+    std::vector<int32_t> n_pp;
+    std::vector<int32_t> n_tg;
+    std::vector<int32_t> n_pl;
+
+    // retrieval params
+    std::vector<std::string> context_files; // context files to embed
+
+    int32_t chunk_size = 64; // chunk size for context embedding
+
+    std::string chunk_separator = "\n"; // chunk separator for context embedding
+
+    // passkey params
+    int32_t n_junk = 250; // number of times to repeat the junk text
+    int32_t i_pos  = -1;  // position of the passkey in the junk text
+
+    // imatrix params
+    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
+    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
+    int32_t i_chunk     =  0; // start processing from this chunk
+
+    bool process_output = false; // collect data for the output tensor
+    bool compute_ppl    = true;  // whether to compute perplexity
+
+    // cvector-generator params
+    int n_pca_batch = 100;
+    int n_pca_iterations = 1000;
+    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
+    std::string cvector_outfile       = "control_vector.gguf";
+    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+    bool spm_infill = false; // suffix/prefix/middle pattern for infill
 };
 
 void gpt_params_handle_model_default(gpt_params & params);
 
-bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
-
 bool gpt_params_parse_ex   (int argc, char ** argv, gpt_params & params);
 bool gpt_params_parse      (int argc, char ** argv, gpt_params & params);
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
 bool gpt_params_find_arg   (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
 
-std::string get_system_info(const gpt_params & params);
+std::string gpt_params_get_system_info(const gpt_params & params);
 
-std::string gpt_random_prompt(std::mt19937 & rng);
-
-void process_escapes(std::string& input);
-
-bool validate_file_name(const std::string & filename);
-
 //
 // String utils
 //
 
-std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);
+
 std::string string_strip(const std::string & str);
-std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
+std::string string_get_sortable_timestamp();
+
+template<class T>
+static std::vector<T> string_split(const std::string & str, char delim) {
+    std::vector<T> values;
+    std::istringstream str_stream(str);
+    std::string token;
+    while (std::getline(str_stream, token, delim)) {
+        T value;
+        std::istringstream token_stream(token);
+        token_stream >> value;
+        values.push_back(value);
+    }
+    return values;
+}
+
+bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+void string_process_escapes(std::string & input);
+
+//
+// Filesystem utils
+//
+
+bool fs_validate_filename(const std::string & filename);
+bool fs_create_directory_with_parents(const std::string & path);
+
+std::string fs_get_cache_directory();
+std::string fs_get_cache_file(const std::string & filename);
+
 //
 // Model utils
 //
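For reference, the templated string_split added in the hunk above can be driven like this. This is a minimal sketch, not part of the patch; the input values are made up, and it assumes the program is compiled together with common/common.h from this change set.

    // Sketch: split a comma-separated list into ints using the new string_split<T>.
    #include <cstdio>
    #include <vector>
    #include "common.h"

    int main() {
        std::vector<int> layers = string_split<int>("0,16,32", ',');
        for (int l : layers) {
            std::printf("%d\n", l); // prints 0, 16, 32
        }
        return 0;
    }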
@@ -277,34 +370,52 @@ std::string llama_detokenize_bpe(
 bool llama_should_add_bos_token(const llama_model * model);
 
 //
-// YAML utils
+// Chat template utils
 //
 
-bool create_directory_with_parents(const std::string & path);
-void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
-void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
-void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
-std::string get_sortable_timestamp();
+// same with llama_chat_message, but uses std::string
+struct llama_chat_msg {
+    std::string role;
+    std::string content;
+};
 
-void dump_non_result_info_yaml(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool llama_chat_verify_template(const std::string & tmpl);
+
+// CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & chat,
+        bool add_ass);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass);
+
+// Returns an example of formatted chat
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl);
 
 //
 // KV cache utils
 //
 
 // Dump the KV cache view with the number of sequences per cell.
-void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
 
 //
 // Embedding utils
 //
 
-void llama_embd_normalize(const float * inp, float * out, int n);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
 
 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
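The new chat-template helpers declared in the hunk above can be used roughly as follows. This is a sketch only: `model` is assumed to be a llama_model loaded elsewhere, and passing an empty template string selects the model's built-in template (falling back to chatml when unsupported), as the comments describe.

    #include <string>
    #include <vector>
    #include "common.h"
    #include "llama.h"

    // Sketch: build a prompt from a two-message chat using the new wrapper.
    std::string build_prompt(const llama_model * model) {
        std::vector<llama_chat_msg> chat = {
            {"system", "You are a helpful assistant."},
            {"user",   "Hello!"},
        };
        // "" -> use the model's built-in template
        return llama_chat_apply_template(model, "", chat, /* add_ass= */ true);
    }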
@@ -332,6 +443,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 //
 // Split utils
 //
 
 static const char * const LLM_KV_SPLIT_NO            = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+//
+// YAML utils
+//
+
+void yaml_dump_vector_float    (FILE * stream, const char * prop_name, const std::vector<float> & data);
+void yaml_dump_vector_int      (FILE * stream, const char * prop_name, const std::vector<int> & data);
+void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
+
+void yaml_dump_non_result_info(
+    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
@@ -46,8 +46,12 @@ namespace grammar_parser {
         state.rules[rule_id] = rule;
     }
 
+    static bool is_digit_char(char c) {
+        return '0' <= c && c <= '9';
+    }
+
     static bool is_word_char(char c) {
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
+        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
     }
 
     static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {

@@ -99,6 +103,17 @@ namespace grammar_parser {
         return pos;
     }
 
+    static const char * parse_int(const char * src) {
+        const char * pos = src;
+        while (is_digit_char(*pos)) {
+            pos++;
+        }
+        if (pos == src) {
+            throw std::runtime_error(std::string("expecting integer at ") + src);
+        }
+        return pos;
+    }
+
     static std::pair<uint32_t, const char *> parse_char(const char * src) {
         if (*src == '\\') {
             switch (src[1]) {

@@ -137,6 +152,60 @@ namespace grammar_parser {
             bool is_nested) {
         size_t last_sym_start = out_elements.size();
         const char * pos = src;
+
+        auto handle_repetitions = [&](int min_times, int max_times) {
+
+            if (last_sym_start == out_elements.size()) {
+                throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
+            }
+
+            // apply transformation to previous symbol (last_sym_start to end) according to
+            // the following rewrite rules:
+            // S{m,n} --> S S S (m times) S'(n-m)
+            //            S'(x)   ::= S S'(x-1) |
+            //            (... n-m definitions of these S' rules ...)
+            //            S'(1)   ::= S |
+            // S{m,}  --> S S S (m times) S'
+            //            S'     ::= S S' |
+            // S*     --> S{0,}
+            //        --> S'     ::= S S' |
+            // S+     --> S{1,}
+            //        --> S S'
+            //            S'     ::= S S' |
+            // S?     --> S{0,1}
+            //        --> S'
+            //            S'     ::= S |
+
+            std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
+            if (min_times == 0) {
+                out_elements.resize(last_sym_start);
+            } else {
+                // Repeat the previous elements (min_times - 1) times
+                for (int i = 1; i < min_times; i++) {
+                    out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
+                }
+            }
+
+            uint32_t last_rec_rule_id = 0;
+            auto n_opt = max_times < 0 ? 1 : max_times - min_times;
+
+            std::vector<llama_grammar_element> rec_rule(previous_elements);
+            for (int i = 0; i < n_opt; i++) {
+                rec_rule.resize(previous_elements.size());
+                uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
+                if (i > 0 || max_times < 0) {
+                    rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
+                }
+                rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+                rec_rule.push_back({LLAMA_GRETYPE_END, 0});
+                add_rule(state, rec_rule_id, rec_rule);
+                last_rec_rule_id = rec_rule_id;
+            }
+            if (n_opt > 0) {
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
+            }
+        };
+
         while (*pos) {
             if (*pos == '"') { // literal string
                 pos++;

@@ -197,40 +266,51 @@ namespace grammar_parser {
                     throw std::runtime_error(std::string("expecting ')' at ") + pos);
                 }
                 pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
-                if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
-                }
-
-                // apply transformation to previous symbol (last_sym_start to end) according to
-                // rewrite rules:
-                // S* --> S' ::= S S' |
-                // S+ --> S' ::= S S' | S
-                // S? --> S' ::= S |
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                std::vector<llama_grammar_element> sub_rule;
-                // add preceding symbol to generated rule
-                sub_rule.insert(
-                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                if (*pos == '*' || *pos == '+') {
-                    // cause generated rule to recurse
-                    sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-                }
-                // mark start of alternate def
-                sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
-                if (*pos == '+') {
-                    // add preceding symbol as alternate only for '+' (otherwise empty)
-                    sub_rule.insert(
-                        sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                }
-                sub_rule.push_back({LLAMA_GRETYPE_END, 0});
-                add_rule(state, sub_rule_id, sub_rule);
-
-                // in original rule, replace previous symbol with reference to generated rule
-                out_elements.resize(last_sym_start);
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-
+            } else if (*pos == '.') { // any char
+                last_sym_start = out_elements.size();
+                out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
                 pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '*') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, -1);
+            } else if (*pos == '+') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(1, -1);
+            } else if (*pos == '?') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, 1);
+            } else if (*pos == '{') {
+                pos = parse_space(pos + 1, is_nested);
+
+                if (!is_digit_char(*pos)) {
+                    throw std::runtime_error(std::string("expecting an int at ") + pos);
+                }
+                const char * int_end = parse_int(pos);
+                int min_times = std::stoul(std::string(pos, int_end - pos));
+                pos = parse_space(int_end, is_nested);
+
+                int max_times = -1;
+
+                if (*pos == '}') {
+                    max_times = min_times;
+                    pos = parse_space(pos + 1, is_nested);
+                } else if (*pos == ',') {
+                    pos = parse_space(pos + 1, is_nested);
+
+                    if (is_digit_char(*pos)) {
+                        const char * int_end = parse_int(pos);
+                        max_times = std::stoul(std::string(pos, int_end - pos));
+                        pos = parse_space(int_end, is_nested);
+                    }
+
+                    if (*pos != '}') {
+                        throw std::runtime_error(std::string("expecting '}' at ") + pos);
+                    }
+                    pos = parse_space(pos + 1, is_nested);
+                } else {
+                    throw std::runtime_error(std::string("expecting ',' at ") + pos);
+                }
+                handle_repetitions(min_times, max_times);
             } else {
                 break;
             }
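A quick way to exercise the new `{m,n}` bounds and the `.` any-char token added above is to feed a grammar through the parser's existing entry points. This is a sketch; the grammar string is made up, and it assumes the common grammar-parser header from this tree is available to include.

    #include <cstdio>
    #include "grammar-parser.h"

    int main() {
        // Uses the new bounded repetitions and '.' introduced in this hunk.
        const char * gbnf = "root ::= [a-z]{2,8} (\"-\" .{1,3})?\n";
        grammar_parser::parse_state state = grammar_parser::parse(gbnf);
        grammar_parser::print_grammar(stdout, state); // prints root plus the generated helper rules
        return 0;
    }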
@@ -325,6 +405,7 @@ namespace grammar_parser {
             case LLAMA_GRETYPE_CHAR_NOT:       return true;
             case LLAMA_GRETYPE_CHAR_ALT:       return true;
             case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+            case LLAMA_GRETYPE_CHAR_ANY:       return true;
             default:                           return false;
         }
     }

@@ -339,6 +420,7 @@ namespace grammar_parser {
             case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
             case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
             case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
+            case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY");       break;
         }
         switch (elem.type) {
             case LLAMA_GRETYPE_END:

@@ -350,6 +432,7 @@ namespace grammar_parser {
             case LLAMA_GRETYPE_CHAR_NOT:
             case LLAMA_GRETYPE_CHAR_RNG_UPPER:
             case LLAMA_GRETYPE_CHAR_ALT:
+            case LLAMA_GRETYPE_CHAR_ANY:
                 fprintf(file, "(\"");
                 print_grammar_char(file, elem.value);
                 fprintf(file, "\") ");

@@ -407,11 +490,15 @@ namespace grammar_parser {
                 }
                 print_grammar_char(file, elem.value);
                 break;
+            case LLAMA_GRETYPE_CHAR_ANY:
+                fprintf(file, ".");
+                break;
         }
         if (is_char_element(elem)) {
             switch (rule[i + 1].type) {
                 case LLAMA_GRETYPE_CHAR_ALT:
                 case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                case LLAMA_GRETYPE_CHAR_ANY:
                     break;
                 default:
                     fprintf(file, "] ");
@@ -16,92 +16,282 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa
 
 static std::string repeat(const std::string & str, size_t n);
 
-static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
-    if (separator_rule.empty()) {
-        if (min_items == 0 && max_items == 1) {
-            return item_rule + "?";
-        } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
-            return item_rule + "+";
-        }
-    }
-
-    std::string result;
-    if (min_items > 0) {
-        if (item_rule_is_literal && separator_rule.empty()) {
-            result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
-        } else {
-            std::vector<std::string> items(min_items, item_rule);
-            result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
-        }
-    }
-
-    std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
-        auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
-
-        if (up_to_n == 0) {
-            return "";
-        } else if (up_to_n == 1) {
-            return "(" + content + ")?";
-        } else if (!separator_rule.empty() && !prefix_with_sep) {
-            return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
-        } else {
-            std::string res = repeat("(" + content + " ", up_to_n);
-            // strip trailing space
-            res = res.substr(0, res.length() - 1);
-            res += repeat(")?", up_to_n);
-            return res;
-        }
-    };
-
-    if (min_items > 0 && max_items != min_items) {
-        result += " ";
-    }
-
-    if (max_items != std::numeric_limits<int>::max()) {
-        result += opt_repetitions(max_items - min_items, min_items > 0);
-    } else {
-        std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
-        if (min_items == 0 && !separator_rule.empty()) {
-            result = "(" + item_rule + " " + item_operator + "*)?";
-        } else {
-            result += item_operator + "*";
-        }
-    }
-
-    return result;
-}
-
-const std::string SPACE_RULE = "\" \"?";
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
+    auto has_max = max_items != std::numeric_limits<int>::max();
+
+    if (min_items == 0 && max_items == 1) {
+        return item_rule + "?";
+    }
+
+    if (separator_rule.empty()) {
+        if (min_items == 1 && !has_max) {
+            return item_rule + "+";
+        } else if (min_items == 0 && !has_max) {
+            return item_rule + "*";
+        } else {
+            return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
+        }
+    }
+
+    auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
+    if (min_items == 0) {
+        result = "(" + result + ")?";
+    }
+    return result;
+}
+
+/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
+class string_view {
+    const std::string & _str;
+    const size_t _start;
+    const size_t _end;
+public:
+    string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
+
+    size_t size() const {
+        return _end - _start;
+    }
+
+    size_t length() const {
+        return size();
+    }
+
+    operator std::string() const {
+        return str();
+    }
+
+    std::string str() const {
+        return _str.substr(_start, _end - _start);
+    }
+
+    string_view substr(size_t pos, size_t len = std::string::npos) const {
+        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
+    }
+
+    char operator[](size_t pos) const {
+        auto index = _start + pos;
+        if (index >= _end) {
+            throw std::out_of_range("string_view index out of range");
+        }
+        return _str[_start + pos];
+    }
+
+    bool operator==(const string_view & other) const {
+        std::string this_str = *this;
+        std::string other_str = other;
+        return this_str == other_str;
+    }
+};
+
+static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+    auto has_min = min_value != std::numeric_limits<int>::min();
+    auto has_max = max_value != std::numeric_limits<int>::max();
+
+    auto digit_range = [&](char from, char to) {
+        out << "[";
+        if (from == to) {
+            out << from;
+        } else {
+            out << from << "-" << to;
+        }
+        out << "]";
+    };
+    auto more_digits = [&](int min_digits, int max_digits) {
+        out << "[0-9]";
+        if (min_digits == max_digits && min_digits == 1) {
+            return;
+        }
+        out << "{";
+        out << min_digits;
+        if (max_digits != min_digits) {
+            out << ",";
+            if (max_digits != std::numeric_limits<int>::max()) {
+                out << max_digits;
+            }
+        }
+        out << "}";
+    };
+    std::function<void(const string_view &, const string_view &)> uniform_range =
+        [&](const string_view & from, const string_view & to) {
+            size_t i = 0;
+            while (i < from.length() && i < to.length() && from[i] == to[i]) {
+                i++;
+            }
+            if (i > 0) {
+                out << "\"" << from.substr(0, i).str() << "\"";
+            }
+            if (i < from.length() && i < to.length()) {
+                if (i > 0) {
+                    out << " ";
+                }
+                auto sub_len = from.length() - i - 1;
+                if (sub_len > 0) {
+                    auto from_sub = from.substr(i + 1);
+                    auto to_sub = to.substr(i + 1);
+                    auto sub_zeros = repeat("0", sub_len);
+                    auto sub_nines = repeat("9", sub_len);
+
+                    auto to_reached = false;
+                    out << "(";
+                    if (from_sub == sub_zeros) {
+                        digit_range(from[i], to[i] - 1);
+                        out << " ";
+                        more_digits(sub_len, sub_len);
+                    } else {
+                        out << "[" << from[i] << "] ";
+                        out << "(";
+                        uniform_range(from_sub, sub_nines);
+                        out << ")";
+                        if (from[i] < to[i] - 1) {
+                            out << " | ";
+                            if (to_sub == sub_nines) {
+                                digit_range(from[i] + 1, to[i]);
+                                to_reached = true;
+                            } else {
+                                digit_range(from[i] + 1, to[i] - 1);
+                            }
+                            out << " ";
+                            more_digits(sub_len, sub_len);
+                        }
+                    }
+                    if (!to_reached) {
+                        out << " | ";
+                        digit_range(to[i], to[i]);
+                        out << " ";
+                        uniform_range(sub_zeros, to_sub);
+                    }
+                    out << ")";
+                } else {
+                    out << "[" << from[i] << "-" << to[i] << "]";
+                }
+            }
+        };
+
+    if (has_min && has_max) {
+        if (min_value < 0 && max_value < 0) {
+            out << "\"-\" (";
+            _build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
+            out << ")";
+            return;
+        }
+
+        if (min_value < 0) {
+            out << "\"-\" (";
+            _build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
+            out << ") | ";
+            min_value = 0;
+        }
+
+        auto min_s = std::to_string(min_value);
+        auto max_s = std::to_string(max_value);
+        auto min_digits = min_s.length();
+        auto max_digits = max_s.length();
+
+        for (auto digits = min_digits; digits < max_digits; digits++) {
+            uniform_range(min_s, repeat("9", digits));
+            min_s = "1" + repeat("0", digits);
+            out << " | ";
+        }
+        uniform_range(min_s, max_s);
+        return;
+    }
+
+    auto less_decimals = std::max(decimals_left - 1, 1);
+
+    if (has_min) {
+        if (min_value < 0) {
+            out << "\"-\" (";
+            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            out << ") | [0] | [1-9] ";
+            more_digits(0, decimals_left - 1);
+        } else if (min_value == 0) {
+            if (top_level) {
+                out << "[0] | [1-9] ";
+                more_digits(0, less_decimals);
+            } else {
+                more_digits(1, decimals_left);
+            }
+        } else if (min_value <= 9) {
+            char c = '0' + min_value;
+            auto range_start = top_level ? '1' : '0';
+            if (c > range_start) {
+                digit_range(range_start, c - 1);
+                out << " ";
+                more_digits(1, less_decimals);
+                out << " | ";
+            }
+            digit_range(c, '9');
+            out << " ";
+            more_digits(0, less_decimals);
+        } else {
+            auto min_s = std::to_string(min_value);
+            auto len = min_s.length();
+            auto c = min_s[0];
+
+            if (c > '1') {
+                digit_range(top_level ? '1' : '0', c - 1);
+                out << " ";
+                more_digits(len, less_decimals);
+                out << " | ";
+            }
+            digit_range(c, c);
+            out << " (";
+            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+            out << ")";
+            if (c < '9') {
+                out << " | ";
+                digit_range(c + 1, '9');
+                out << " ";
+                more_digits(len - 1, less_decimals);
+            }
+        }
+        return;
+    }
+
+    if (has_max) {
+        if (max_value >= 0) {
+            if (top_level) {
+                out << "\"-\" [1-9] ";
+                more_digits(0, less_decimals);
+                out << " | ";
+            }
+            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
+        } else {
+            out << "\"-\" (";
+            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+            out << ")";
+        }
+        return;
+    }
+
+    throw std::runtime_error("At least one of min_value or max_value must be set");
+}
+
+const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
 
 struct BuiltinRule {
     std::string content;
     std::vector<std::string> deps;
 };
 
-const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
-
 std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
     {"boolean", {"(\"true\" | \"false\") space", {}}},
-    {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
-    {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
+    {"decimal-part", {"[0-9]{1,16}", {}}},
+    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
     {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
     {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
     {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
     {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
     {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
-    {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
-    {"char",   {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
+    {"char",   {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
     {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
     {"null", {"\"null\" space", {}}},
 };
 
 std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
-    {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
-    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
     {"date-time", {"date \"T\" time", {"date", "time"}}},
     {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
     {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
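To see what the rewritten build_repetition emits with the new GBNF bounds, the helper can be lifted into a standalone program. This is only a sketch: the function body is copied from the hunk above, while main() and the sample inputs are illustrative and not part of the patch; the commented outputs are what that logic produces under these assumptions.

    #include <cstdio>
    #include <limits>
    #include <string>

    static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
        auto has_max = max_items != std::numeric_limits<int>::max();
        if (min_items == 0 && max_items == 1) {
            return item_rule + "?";
        }
        if (separator_rule.empty()) {
            if (min_items == 1 && !has_max) {
                return item_rule + "+";
            } else if (min_items == 0 && !has_max) {
                return item_rule + "*";
            } else {
                return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
            }
        }
        // With a separator, require one item and then a bounded repetition of "(sep item)".
        auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
        if (min_items == 0) {
            result = "(" + result + ")?";
        }
        return result;
    }

    int main() {
        const int inf = std::numeric_limits<int>::max();
        std::printf("%s\n", build_repetition("[0-9]", 3, 5).c_str());               // [0-9]{3,5}
        std::printf("%s\n", build_repetition("item", 0, inf).c_str());              // item*
        std::printf("%s\n", build_repetition("item", 2, 4, "\",\" space").c_str()); // item ("," space item){1,3}
        return 0;
    }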
@@ -126,7 +316,7 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 };
 
 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
+std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
 
 template <typename Iterator>
 std::string join(Iterator begin, Iterator end, const std::string & separator) {
@@ -197,7 +387,6 @@ static std::string format_literal(const std::string & literal) {
     return "\"" + escaped + "\"";
 }
 
-
 class SchemaConverter {
 private:
     std::function<json(const std::string &)> _fetch_json;
@@ -385,8 +574,7 @@ private:
                     sub_is_literal ? "\"" + sub + "\"" : sub,
                     min_times,
                     max_times,
-                    "",
-                    sub_is_literal
+                    ""
                 );
                 seq.back().second = false;
             } else {
@ -426,6 +614,75 @@ private:
|
||||||
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
|
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Returns a rule that matches a JSON string that is none of the provided strings
|
||||||
|
|
||||||
|
not_strings({"a"})
|
||||||
|
-> ["] ( [a] char+ | [^"a] char* )? ["] space
|
||||||
|
not_strings({"and", "also"})
|
||||||
|
-> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
|
||||||
|
*/
|
||||||
|
std::string _not_strings(const std::vector<std::string> & strings) {
|
||||||
|
|
||||||
|
struct TrieNode {
|
||||||
|
std::map<char, TrieNode> children;
|
||||||
|
bool is_end_of_string;
|
||||||
|
|
||||||
|
TrieNode() : is_end_of_string(false) {}
|
||||||
|
|
||||||
|
void insert(const std::string & string) {
|
||||||
|
auto node = this;
|
||||||
|
for (char c : string) {
|
||||||
|
node = &node->children[c];
|
||||||
|
}
|
||||||
|
node->is_end_of_string = true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
TrieNode trie;
|
||||||
|
for (const auto & s : strings) {
|
||||||
|
trie.insert(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
|
||||||
|
std::ostringstream out;
|
||||||
|
out << "[\"] ( ";
|
||||||
|
std::function<void(const TrieNode &)> visit = [&](const TrieNode & node) {
|
||||||
|
std::ostringstream rejects;
|
||||||
|
auto first = true;
|
||||||
|
for (const auto & kv : node.children) {
|
||||||
|
rejects << kv.first;
|
||||||
|
if (first) {
|
||||||
|
first = false;
|
||||||
|
} else {
|
||||||
|
out << " | ";
|
||||||
|
}
|
||||||
|
out << "[" << kv.first << "]";
|
||||||
|
if (!kv.second.children.empty()) {
|
||||||
|
out << " (";
|
||||||
|
visit(kv.second);
|
||||||
|
out << ")";
|
||||||
|
} else if (kv.second.is_end_of_string) {
|
||||||
|
out << " " << char_rule << "+";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!node.children.empty()) {
|
||||||
|
if (!first) {
|
||||||
|
out << " | ";
|
||||||
|
}
|
||||||
|
out << "[^\"" << rejects.str() << "] " << char_rule << "*";
|
||||||
|
}
|
||||||
|
};
|
||||||
|
visit(trie);
|
||||||
|
|
||||||
|
out << " )";
|
||||||
|
if (!trie.is_end_of_string) {
|
||||||
|
out << "?";
|
||||||
|
}
|
||||||
|
out << " [\"] space";
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
||||||
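To make the new `_not_strings()` helper above easier to follow, here is a self-contained sketch of the same trie technique, for illustration only: the free functions `not_strings()`, `visit()` and `main()` are stand-ins introduced here, and the `char` primitive is written literally instead of being registered through `_add_primitive()` as the converter does.

```cpp
// Standalone sketch (not the llama.cpp implementation): build the same kind of
// character trie that _not_strings() uses and emit the "none of these strings"
// rule body shown in the comment block above.
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

struct TrieNode {
    std::map<char, TrieNode> children;
    bool is_end_of_string = false;

    void insert(const std::string & s) {
        TrieNode * node = this;
        for (char c : s) {
            node = &node->children[c];
        }
        node->is_end_of_string = true;
    }
};

// For each child: either recurse into longer prefixes or, at an end-of-string
// node, require at least one extra character; finally a negated character class
// covers every character that does not start a forbidden string.
static void visit(const TrieNode & node, std::ostringstream & out) {
    std::string rejects;
    bool first = true;
    for (const auto & kv : node.children) {
        rejects += kv.first;
        if (!first) {
            out << " | ";
        }
        first = false;
        out << "[" << kv.first << "]";
        if (!kv.second.children.empty()) {
            out << " (";
            visit(kv.second, out);
            out << ")";
        } else if (kv.second.is_end_of_string) {
            out << " char+";
        }
    }
    if (!node.children.empty()) {
        if (!first) {
            out << " | ";
        }
        out << "[^\"" << rejects << "] char*";
    }
}

static std::string not_strings(const std::vector<std::string> & strings) {
    TrieNode trie;
    for (const auto & s : strings) {
        trie.insert(s);
    }
    std::ostringstream out;
    out << "[\"] ( ";
    visit(trie, out);
    out << " )";
    if (!trie.is_end_of_string) {
        out << "?";
    }
    out << " [\"] space";
    return out.str();
}

int main() {
    std::cout << not_strings({"a"}) << "\n";
    std::cout << not_strings({"and", "also"}) << "\n";
}
```

With a C++17 compiler this prints the same two rules that the comment block in the diff lists for `{"a"}` and `{"and", "also"}`.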
std::string _resolve_ref(const std::string & ref) {
|
std::string _resolve_ref(const std::string & ref) {
|
||||||
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
|
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
|
||||||
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
|
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
|
||||||
|
@ -446,6 +703,7 @@ private:
|
||||||
std::vector<std::string> required_props;
|
std::vector<std::string> required_props;
|
||||||
std::vector<std::string> optional_props;
|
std::vector<std::string> optional_props;
|
||||||
std::unordered_map<std::string, std::string> prop_kv_rule_names;
|
std::unordered_map<std::string, std::string> prop_kv_rule_names;
|
||||||
|
std::vector<std::string> prop_names;
|
||||||
for (const auto & kv : properties) {
|
for (const auto & kv : properties) {
|
||||||
const auto &prop_name = kv.first;
|
const auto &prop_name = kv.first;
|
||||||
const auto &prop_schema = kv.second;
|
const auto &prop_schema = kv.second;
|
||||||
|
@ -460,11 +718,18 @@ private:
|
||||||
} else {
|
} else {
|
||||||
optional_props.push_back(prop_name);
|
optional_props.push_back(prop_name);
|
||||||
}
|
}
|
||||||
|
prop_names.push_back(prop_name);
|
||||||
}
|
}
|
||||||
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
|
if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
|
||||||
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
|
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
|
||||||
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
|
std::string value_rule =
|
||||||
std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
|
additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
|
||||||
|
: _add_primitive("value", PRIMITIVE_RULES.at("value"));
|
||||||
|
|
||||||
|
auto key_rule =
|
||||||
|
prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string"))
|
||||||
|
: _add_rule(sub_name + "-k", _not_strings(prop_names));
|
||||||
|
std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule);
|
||||||
prop_kv_rule_names["*"] = kv_rule;
|
prop_kv_rule_names["*"] = kv_rule;
|
||||||
optional_props.push_back("*");
|
optional_props.push_back("*");
|
||||||
}
|
}
|
||||||
|
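For context on the hunk above: when the object declares named properties, the key rule for `additionalProperties` entries is now built with `_not_strings(prop_names)` instead of the generic `string` primitive, presumably so a wildcard entry cannot also match a declared key. A minimal illustration of that decision logic follows; the helpers below are local stand-ins that only return placeholder rule text, not the SchemaConverter API.

```cpp
#include <iostream>
#include <string>
#include <vector>

// Placeholder stand-ins for the converter's real helpers: _add_primitive("string", ...)
// and the trie-based _not_strings() shown earlier in this diff.
static std::string generic_string_rule() { return "string"; }
static std::string not_strings(const std::vector<std::string> & names) {
    return "not-strings<" + std::to_string(names.size()) + " declared keys>";
}

// Key rule used for additionalProperties entries.
static std::string additional_key_rule(const std::vector<std::string> & declared_prop_names) {
    if (declared_prop_names.empty()) {
        return generic_string_rule();          // no named properties: any string key
    }
    return not_strings(declared_prop_names);   // otherwise: any key except the declared names
}

int main() {
    std::cout << additional_key_rule({}) << "\n";              // "string"
    std::cout << additional_key_rule({"id", "name"}) << "\n";  // "not-strings<2 declared keys>"
}
```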
@ -490,15 +755,11 @@ private:
|
||||||
}
|
}
|
||||||
std::string k = ks[0];
|
std::string k = ks[0];
|
||||||
std::string kv_rule_name = prop_kv_rule_names[k];
|
std::string kv_rule_name = prop_kv_rule_names[k];
|
||||||
if (k == "*") {
|
std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
|
||||||
res = _add_rule(
|
if (first_is_optional) {
|
||||||
name + (name.empty() ? "" : "-") + "additional-kvs",
|
res = comma_ref + (k == "*" ? "*" : "?");
|
||||||
kv_rule_name + " ( \",\" space " + kv_rule_name + " )*"
|
|
||||||
);
|
|
||||||
} else if (first_is_optional) {
|
|
||||||
res = "( \",\" space " + kv_rule_name + " )?";
|
|
||||||
} else {
|
} else {
|
||||||
res = kv_rule_name;
|
res = kv_rule_name + (k == "*" ? " " + comma_ref + "*" : "");
|
||||||
}
|
}
|
||||||
if (ks.size() > 1) {
|
if (ks.size() > 1) {
|
||||||
res += " " + _add_rule(
|
res += " " + _add_rule(
|
||||||
|
@ -632,17 +893,19 @@ public:
|
||||||
} else if (schema_type.is_array()) {
|
} else if (schema_type.is_array()) {
|
||||||
std::vector<json> schema_types;
|
std::vector<json> schema_types;
|
||||||
for (const auto & t : schema_type) {
|
for (const auto & t : schema_type) {
|
||||||
schema_types.push_back({{"type", t}});
|
json schema_copy(schema);
|
||||||
|
schema_copy["type"] = t;
|
||||||
|
schema_types.push_back(schema_copy);
|
||||||
}
|
}
|
||||||
return _add_rule(rule_name, _generate_union_rule(name, schema_types));
|
return _add_rule(rule_name, _generate_union_rule(name, schema_types));
|
||||||
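The hunk above changes how a `"type"` array is expanded into a union: each branch now starts from a copy of the full schema and only overrides `"type"`, so sibling keywords such as `minLength` are no longer dropped. A small before/after illustration using `nlohmann::json` (the same single-header JSON library the converter is built on; the schema values here are made up):

```cpp
#include <nlohmann/json.hpp>

#include <iostream>
#include <vector>

using json = nlohmann::json;

int main() {
    const json schema = { {"type", {"string", "null"}}, {"minLength", 1} };

    std::vector<json> old_branches;
    std::vector<json> new_branches;
    for (const auto & t : schema["type"]) {
        // old behaviour: keep only the type, silently dropping minLength
        old_branches.push_back({ {"type", t} });

        // new behaviour: copy the whole schema and override only the type
        json schema_copy(schema);
        schema_copy["type"] = t;
        new_branches.push_back(schema_copy);
    }

    std::cout << json(old_branches).dump() << "\n";
    std::cout << json(new_branches).dump() << "\n";
}
```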
} else if (schema.contains("const")) {
|
} else if (schema.contains("const")) {
|
||||||
return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
|
return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
|
||||||
} else if (schema.contains("enum")) {
|
} else if (schema.contains("enum")) {
|
||||||
std::vector<std::string> enum_values;
|
std::vector<std::string> enum_values;
|
||||||
for (const auto & v : schema["enum"]) {
|
for (const auto & v : schema["enum"]) {
|
||||||
enum_values.push_back(_generate_constant_rule(v));
|
enum_values.push_back(_generate_constant_rule(v));
|
||||||
}
|
}
|
||||||
return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | "));
|
return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
|
||||||
} else if ((schema_type.is_null() || schema_type == "object")
|
} else if ((schema_type.is_null() || schema_type == "object")
|
||||||
&& (schema.contains("properties") ||
|
&& (schema.contains("properties") ||
|
||||||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
|
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
|
||||||
|
@ -724,6 +987,24 @@ public:
|
||||||
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
|
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
|
||||||
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
|
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
|
||||||
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
|
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
|
||||||
|
} else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
|
||||||
|
int min_value = std::numeric_limits<int>::min();
|
||||||
|
int max_value = std::numeric_limits<int>::max();
|
||||||
|
if (schema.contains("minimum")) {
|
||||||
|
min_value = schema["minimum"].get<int>();
|
||||||
|
} else if (schema.contains("exclusiveMinimum")) {
|
||||||
|
min_value = schema["exclusiveMinimum"].get<int>() + 1;
|
||||||
|
}
|
||||||
|
if (schema.contains("maximum")) {
|
||||||
|
max_value = schema["maximum"].get<int>();
|
||||||
|
} else if (schema.contains("exclusiveMaximum")) {
|
||||||
|
max_value = schema["exclusiveMaximum"].get<int>() - 1;
|
||||||
|
}
|
||||||
|
std::stringstream out;
|
||||||
|
out << "(";
|
||||||
|
_build_min_max_int(min_value, max_value, out);
|
||||||
|
out << ") space";
|
||||||
|
return _add_rule(rule_name, out.str());
|
||||||
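As a quick aside on the integer branch just added: the four JSON Schema bound keywords are first collapsed into a single inclusive `[min, max]` pair, and only then turned into digit patterns by `_build_min_max_int`. A tiny self-contained sketch of that normalization step (the struct and function names are illustrative, not llama.cpp API):

```cpp
#include <iostream>
#include <limits>
#include <optional>
#include <utility>

struct IntBounds {
    std::optional<int> minimum, exclusive_minimum;
    std::optional<int> maximum, exclusive_maximum;
};

// Collapse minimum/exclusiveMinimum/maximum/exclusiveMaximum into one
// inclusive range, defaulting to the full int range, as the hunk above does.
static std::pair<int, int> normalize_bounds(const IntBounds & b) {
    int min_value = std::numeric_limits<int>::min();
    int max_value = std::numeric_limits<int>::max();
    if (b.minimum) {
        min_value = *b.minimum;
    } else if (b.exclusive_minimum) {
        min_value = *b.exclusive_minimum + 1;   // exclusive lower bound -> inclusive
    }
    if (b.maximum) {
        max_value = *b.maximum;
    } else if (b.exclusive_maximum) {
        max_value = *b.exclusive_maximum - 1;   // exclusive upper bound -> inclusive
    }
    return {min_value, max_value};
}

int main() {
    // e.g. {"type": "integer", "exclusiveMinimum": 0, "maximum": 100}
    IntBounds b;
    b.exclusive_minimum = 0;
    b.maximum = 100;
    const auto [lo, hi] = normalize_bounds(b);
    std::cout << lo << ".." << hi << "\n";   // prints 1..100
}
```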
} else if (schema.empty() || schema_type == "object") {
|
} else if (schema.empty() || schema_type == "object") {
|
||||||
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
|
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -28,9 +28,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
|
||||||
|
|
||||||
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
|
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
|
||||||
|
|
||||||
result->grammar = llama_grammar_init(
|
struct llama_grammar * grammar = llama_grammar_init(
|
||||||
grammar_rules.data(),
|
grammar_rules.data(),
|
||||||
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
|
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
|
||||||
|
if (grammar == nullptr) {
|
||||||
|
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||||
|
}
|
||||||
|
result->grammar = grammar;
|
||||||
}
|
}
|
||||||
|
|
||||||
result->prev.resize(params.n_prev);
|
result->prev.resize(params.n_prev);
|
||||||
|
@ -59,9 +63,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
|
||||||
if (!ctx->parsed_grammar.rules.empty()) {
|
if (!ctx->parsed_grammar.rules.empty()) {
|
||||||
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
|
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
|
||||||
|
|
||||||
ctx->grammar = llama_grammar_init(
|
struct llama_grammar * grammar = llama_grammar_init(
|
||||||
grammar_rules.data(),
|
grammar_rules.data(),
|
||||||
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
|
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
|
||||||
|
if (grammar == nullptr) {
|
||||||
|
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||||
|
}
|
||||||
|
ctx->grammar = grammar;
|
||||||
}
|
}
|
||||||
|
|
||||||
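Both hunks above follow the same pattern: `llama_grammar_init()` may now return `nullptr`, and the sampling context only stores the grammar after checking the result, throwing otherwise so a bad grammar fails fast instead of surfacing as a crash during sampling. A small sketch of that caller-side pattern, assuming the `llama.h` declarations this diff itself calls; the wrapper name is hypothetical:

```cpp
#include "llama.h"   // llama_grammar_init, llama_grammar_element

#include <cstddef>
#include <stdexcept>

// Hypothetical helper: initialize a grammar or throw, mirroring the checks
// added to llama_sampling_init() and llama_sampling_reset() above.
static struct llama_grammar * init_grammar_or_throw(
        const llama_grammar_element ** rules,
        size_t                         n_rules,
        size_t                         start_rule_index) {
    struct llama_grammar * grammar = llama_grammar_init(rules, n_rules, start_rule_index);
    if (grammar == nullptr) {
        throw std::runtime_error("Failed to initialize llama_grammar");
    }
    return grammar;
}
```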
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
|
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
|
||||||
|
@ -125,7 +133,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
||||||
std::string result = "CFG -> Penalties ";
|
std::string result = "CFG -> Penalties ";
|
||||||
if (params.mirostat == 0) {
|
if (params.mirostat == 0) {
|
||||||
for (auto sampler_type : params.samplers_sequence) {
|
for (auto sampler_type : params.samplers_sequence) {
|
||||||
const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
|
const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
|
||||||
if (!sampler_type_name.empty()) {
|
if (!sampler_type_name.empty()) {
|
||||||
result += "-> " + sampler_type_name + " ";
|
result += "-> " + sampler_type_name + " ";
|
||||||
}
|
}
|
||||||
|
@ -137,6 +145,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
|
||||||
|
switch (sampler_type) {
|
||||||
|
case llama_sampler_type::TOP_K: return "top_k";
|
||||||
|
case llama_sampler_type::TFS_Z: return "tfs_z";
|
||||||
|
case llama_sampler_type::TYPICAL_P: return "typical_p";
|
||||||
|
case llama_sampler_type::TOP_P: return "top_p";
|
||||||
|
case llama_sampler_type::MIN_P: return "min_p";
|
||||||
|
case llama_sampler_type::TEMPERATURE: return "temperature";
|
||||||
|
default : return "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
||||||
|
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
|
||||||
|
{"top_k", llama_sampler_type::TOP_K},
|
||||||
|
{"top_p", llama_sampler_type::TOP_P},
|
||||||
|
{"typical_p", llama_sampler_type::TYPICAL_P},
|
||||||
|
{"min_p", llama_sampler_type::MIN_P},
|
||||||
|
{"tfs_z", llama_sampler_type::TFS_Z},
|
||||||
|
{"temperature", llama_sampler_type::TEMPERATURE}
|
||||||
|
};
|
||||||
|
|
||||||
|
// since sampler names are written in multiple ways
|
||||||
|
// make it ready for both system names and input names
|
||||||
|
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
|
||||||
|
{"top-k", llama_sampler_type::TOP_K},
|
||||||
|
{"top-p", llama_sampler_type::TOP_P},
|
||||||
|
{"nucleus", llama_sampler_type::TOP_P},
|
||||||
|
{"typical-p", llama_sampler_type::TYPICAL_P},
|
||||||
|
{"typical", llama_sampler_type::TYPICAL_P},
|
||||||
|
{"min-p", llama_sampler_type::MIN_P},
|
||||||
|
{"tfs-z", llama_sampler_type::TFS_Z},
|
||||||
|
{"tfs", llama_sampler_type::TFS_Z},
|
||||||
|
{"temp", llama_sampler_type::TEMPERATURE}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
|
sampler_types.reserve(names.size());
|
||||||
|
for (const auto & name : names)
|
||||||
|
{
|
||||||
|
auto sampler_item = sampler_canonical_name_map.find(name);
|
||||||
|
if (sampler_item != sampler_canonical_name_map.end())
|
||||||
|
{
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (allow_alt_names)
|
||||||
|
{
|
||||||
|
sampler_item = sampler_alt_name_map.find(name);
|
||||||
|
if (sampler_item != sampler_alt_name_map.end())
|
||||||
|
{
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sampler_types;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
|
||||||
|
std::unordered_map<char, llama_sampler_type> sampler_name_map {
|
||||||
|
{'k', llama_sampler_type::TOP_K},
|
||||||
|
{'p', llama_sampler_type::TOP_P},
|
||||||
|
{'y', llama_sampler_type::TYPICAL_P},
|
||||||
|
{'m', llama_sampler_type::MIN_P},
|
||||||
|
{'f', llama_sampler_type::TFS_Z},
|
||||||
|
{'t', llama_sampler_type::TEMPERATURE}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
|
sampler_types.reserve(names_string.size());
|
||||||
|
for (const auto & c : names_string) {
|
||||||
|
const auto sampler_item = sampler_name_map.find(c);
|
||||||
|
if (sampler_item != sampler_name_map.end()) {
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sampler_types;
|
||||||
|
}
|
||||||
|
|
||||||
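The three helpers introduced here are declared in `common/sampling.h` later in this diff; a hypothetical usage sketch is shown below. The include path and the accepted spellings are taken from the tables above; everything else (the function and the sample inputs) is illustrative only.

```cpp
#include "sampling.h"   // common/sampling.h in the llama.cpp tree (assumed include path)

#include <cstdio>
#include <string>
#include <vector>

static void demo_sampler_parsing() {
    // Canonical names plus, with allow_alt_names = true, the alternative
    // spellings from the table above ("tfs", "typical", "temp", ...).
    const std::vector<std::string> names = { "top_k", "tfs", "typical", "top_p", "min_p", "temp" };
    const std::vector<llama_sampler_type> from_names =
        llama_sampling_types_from_names(names, /* allow_alt_names= */ true);

    // Single-character shorthand, one sampler per character.
    const std::vector<llama_sampler_type> from_chars = llama_sampling_types_from_chars("kfypmt");

    for (const auto t : from_names) {
        printf("%s ", llama_sampling_type_to_str(t).c_str());
    }
    printf("\n(%zu samplers parsed from the character form)\n", from_chars.size());
}
```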
// no reason to expose this function in the header
|
// no reason to expose this function in the header
|
||||||
static void sampler_queue(
|
static void sampler_queue(
|
||||||
struct llama_context * ctx_main,
|
struct llama_context * ctx_main,
|
||||||
|
@ -179,7 +268,7 @@ static llama_token llama_sampling_sample_impl(
|
||||||
struct llama_context * ctx_main,
|
struct llama_context * ctx_main,
|
||||||
struct llama_context * ctx_cfg,
|
struct llama_context * ctx_cfg,
|
||||||
const int idx,
|
const int idx,
|
||||||
bool is_resampling) { // Add a parameter to indicate if we are resampling
|
bool is_resampling) {
|
||||||
const llama_sampling_params & params = ctx_sampling->params;
|
const llama_sampling_params & params = ctx_sampling->params;
|
||||||
|
|
||||||
const float temp = params.temp;
|
const float temp = params.temp;
|
||||||
|
@ -188,8 +277,8 @@ static llama_token llama_sampling_sample_impl(
|
||||||
const float mirostat_eta = params.mirostat_eta;
|
const float mirostat_eta = params.mirostat_eta;
|
||||||
|
|
||||||
std::vector<float> original_logits;
|
std::vector<float> original_logits;
|
||||||
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits);
|
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
|
||||||
if (!is_resampling) {
|
if (ctx_sampling->grammar != NULL && !is_resampling) {
|
||||||
GGML_ASSERT(!original_logits.empty());
|
GGML_ASSERT(!original_logits.empty());
|
||||||
}
|
}
|
||||||
llama_token id = 0;
|
llama_token id = 0;
|
||||||
|
@ -252,7 +341,7 @@ static llama_token llama_sampling_sample_impl(
|
||||||
// Restore logits from the copy
|
// Restore logits from the copy
|
||||||
std::copy(original_logits.begin(), original_logits.end(), logits);
|
std::copy(original_logits.begin(), original_logits.end(), logits);
|
||||||
|
|
||||||
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling
|
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -285,7 +374,8 @@ static llama_token_data_array llama_sampling_prepare_impl(
|
||||||
// Get a pointer to the logits
|
// Get a pointer to the logits
|
||||||
float * logits = llama_get_logits_ith(ctx_main, idx);
|
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||||
|
|
||||||
if (apply_grammar && original_logits != NULL) {
|
if (ctx_sampling->grammar != NULL && !apply_grammar) {
|
||||||
|
GGML_ASSERT(original_logits != NULL);
|
||||||
// Only make a copy of the original logits if we are not applying grammar checks; it is unclear whether this copy is strictly necessary.
|
// Only make a copy of the original logits if we are not applying grammar checks; it is unclear whether this copy is strictly necessary.
|
||||||
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
|
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
|
||||||
}
|
}
|
||||||
|
@ -342,7 +432,7 @@ llama_token llama_sampling_sample(
|
||||||
struct llama_context * ctx_cfg,
|
struct llama_context * ctx_cfg,
|
||||||
const int idx) {
|
const int idx) {
|
||||||
// Call the implementation function with is_resampling set to false by default
|
// Call the implementation function with is_resampling set to false by default
|
||||||
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
|
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token_data_array llama_sampling_prepare(
|
llama_token_data_array llama_sampling_prepare(
|
||||||
|
|
|
@ -116,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params);
|
||||||
// Print sampling order into a string
|
// Print sampling order into a string
|
||||||
std::string llama_sampling_order_print(const llama_sampling_params & params);
|
std::string llama_sampling_order_print(const llama_sampling_params & params);
|
||||||
|
|
||||||
|
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
|
||||||
|
|
||||||
// this is a common sampling function used across the examples for convenience
|
// this is a common sampling function used across the examples for convenience
|
||||||
// it can serve as a starting point for implementing your own sampling function
|
// it can serve as a starting point for implementing your own sampling function
|
||||||
// Note: When using multiple sequences, it is the caller's responsibility to call
|
// Note: When using multiple sequences, it is the caller's responsibility to call
|
||||||
|
|
|
@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
|
||||||
|
|
||||||
params.custom_n_ctx = false;
|
params.custom_n_ctx = false;
|
||||||
|
|
||||||
params.use_flash = true;
|
params.use_flash = false;
|
||||||
params.use_checkpointing = true;
|
params.use_checkpointing = true;
|
||||||
|
|
||||||
params.sample_start = "";
|
params.sample_start = "";
|
||||||
|
@ -1380,7 +1380,7 @@ bool consume_common_train_arg(
|
||||||
|
|
||||||
void finish_processing_train_args(struct train_params_common * params) {
|
void finish_processing_train_args(struct train_params_common * params) {
|
||||||
if (params->escape) {
|
if (params->escape) {
|
||||||
process_escapes(params->sample_start);
|
string_process_escapes(params->sample_start);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# This script downloads the tokenizer models of the specified models from Huggingface and
|
# This script downloads the tokenizer models of the specified models from Huggingface and
|
||||||
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
|
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
|
||||||
|
@ -72,6 +73,7 @@ models = [
|
||||||
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
||||||
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
||||||
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
||||||
|
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
|
||||||
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
|
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
|
||||||
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
|
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
|
||||||
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
|
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
|
||||||
|
@ -80,6 +82,10 @@ models = [
|
||||||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||||
|
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||||
|
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
|
||||||
|
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
||||||
|
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -209,7 +215,7 @@ src_func = f"""
|
||||||
"""
|
"""
|
||||||
|
|
||||||
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
|
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
|
||||||
convert_py = convert_py_pth.read_text()
|
convert_py = convert_py_pth.read_text(encoding="utf-8")
|
||||||
convert_py = re.sub(
|
convert_py = re.sub(
|
||||||
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
|
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
|
||||||
lambda m: m.group(1) + src_func + m.group(3),
|
lambda m: m.group(1) + src_func + m.group(3),
|
||||||
|
@ -217,7 +223,7 @@ convert_py = re.sub(
|
||||||
flags=re.DOTALL | re.MULTILINE,
|
flags=re.DOTALL | re.MULTILINE,
|
||||||
)
|
)
|
||||||
|
|
||||||
convert_py_pth.write_text(convert_py)
|
convert_py_pth.write_text(convert_py, encoding="utf-8")
|
||||||
|
|
||||||
logger.info("+++ convert-hf-to-gguf.py was updated")
|
logger.info("+++ convert-hf-to-gguf.py was updated")
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
|
@ -1,143 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from pprint import pprint
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from sentencepiece import SentencePieceProcessor
|
|
||||||
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
|
||||||
import gguf
|
|
||||||
|
|
||||||
logger = logging.getLogger("persimmon-to-gguf")
|
|
||||||
|
|
||||||
|
|
||||||
def _flatten_dict(dct, tensors, prefix=None):
|
|
||||||
assert isinstance(dct, dict)
|
|
||||||
for key in dct.keys():
|
|
||||||
new_prefix = prefix + '.' + key if prefix is not None else key
|
|
||||||
if isinstance(dct[key], torch.Tensor):
|
|
||||||
tensors[new_prefix] = dct[key]
|
|
||||||
elif isinstance(dct[key], dict):
|
|
||||||
_flatten_dict(dct[key], tensors, new_prefix)
|
|
||||||
else:
|
|
||||||
raise ValueError(type(dct[key]))
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _get_sentencepiece_tokenizer_info(dir_model: Path):
|
|
||||||
tokenizer_path = dir_model / 'adept_vocab.model'
|
|
||||||
logger.info('getting sentencepiece tokenizer from', tokenizer_path)
|
|
||||||
tokenizer = SentencePieceProcessor(str(tokenizer_path))
|
|
||||||
logger.info('adding tokens')
|
|
||||||
tokens: list[bytes] = []
|
|
||||||
scores: list[float] = []
|
|
||||||
toktypes: list[int] = []
|
|
||||||
|
|
||||||
for i in range(tokenizer.vocab_size()):
|
|
||||||
text: bytes
|
|
||||||
score: float
|
|
||||||
|
|
||||||
piece = tokenizer.id_to_piece(i)
|
|
||||||
text = piece.encode("utf-8")
|
|
||||||
score = tokenizer.get_score(i)
|
|
||||||
|
|
||||||
toktype = 1
|
|
||||||
if tokenizer.is_unknown(i):
|
|
||||||
toktype = 2
|
|
||||||
if tokenizer.is_control(i):
|
|
||||||
toktype = 3
|
|
||||||
if tokenizer.is_unused(i):
|
|
||||||
toktype = 5
|
|
||||||
if tokenizer.is_byte(i):
|
|
||||||
toktype = 6
|
|
||||||
|
|
||||||
tokens.append(text)
|
|
||||||
scores.append(score)
|
|
||||||
toktypes.append(toktype)
|
|
||||||
pass
|
|
||||||
return tokens, scores, toktypes
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
|
|
||||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
|
||||||
parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
|
|
||||||
parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
|
|
||||||
parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
|
|
||||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
|
||||||
args = parser.parse_args()
|
|
||||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
|
||||||
sys.path.append(str(args.adept_inference_dir))
|
|
||||||
persimmon_model = torch.load(args.ckpt_path)
|
|
||||||
hparams = persimmon_model['args']
|
|
||||||
pprint(hparams)
|
|
||||||
tensors: dict[str, torch.Tensor] = {}
|
|
||||||
_flatten_dict(persimmon_model['model'], tensors, None)
|
|
||||||
|
|
||||||
arch = gguf.MODEL_ARCH.PERSIMMON
|
|
||||||
gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
|
|
||||||
|
|
||||||
block_count = hparams.num_layers
|
|
||||||
head_count = hparams.num_attention_heads
|
|
||||||
head_count_kv = head_count
|
|
||||||
ctx_length = hparams.seq_length
|
|
||||||
hidden_size = hparams.hidden_size
|
|
||||||
|
|
||||||
gguf_writer.add_name('persimmon-8b-chat')
|
|
||||||
gguf_writer.add_context_length(ctx_length)
|
|
||||||
gguf_writer.add_embedding_length(hidden_size)
|
|
||||||
gguf_writer.add_block_count(block_count)
|
|
||||||
gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
|
|
||||||
# ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
|
|
||||||
gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
|
|
||||||
gguf_writer.add_head_count(head_count)
|
|
||||||
gguf_writer.add_head_count_kv(head_count_kv)
|
|
||||||
gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
|
|
||||||
gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
|
|
||||||
|
|
||||||
tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
|
|
||||||
gguf_writer.add_tokenizer_model('llama')
|
|
||||||
gguf_writer.add_tokenizer_pre('default')
|
|
||||||
gguf_writer.add_token_list(tokens)
|
|
||||||
gguf_writer.add_token_scores(scores)
|
|
||||||
gguf_writer.add_token_types(toktypes)
|
|
||||||
gguf_writer.add_bos_token_id(71013)
|
|
||||||
gguf_writer.add_eos_token_id(71013)
|
|
||||||
|
|
||||||
tensor_map = gguf.get_tensor_name_map(arch, block_count)
|
|
||||||
logger.info(tensor_map)
|
|
||||||
for name in tensors.keys():
|
|
||||||
data_torch = tensors[name]
|
|
||||||
if name.endswith(".self_attention.rotary_emb.inv_freq"):
|
|
||||||
continue
|
|
||||||
old_dtype = data_torch.dtype
|
|
||||||
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
|
|
||||||
data = data_torch.to(torch.float32).squeeze().numpy()
|
|
||||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
|
||||||
if new_name is None:
|
|
||||||
raise ValueError(f"Can not map tensor '{name}'")
|
|
||||||
|
|
||||||
n_dims = len(data.shape)
|
|
||||||
logger.debug(f"{new_name}, n_dims = {str(n_dims)}, {str(old_dtype)} --> {str(data.dtype)}")
|
|
||||||
gguf_writer.add_tensor(new_name, data)
|
|
||||||
logger.info("gguf: write header")
|
|
||||||
gguf_writer.write_header_to_file()
|
|
||||||
logger.info("gguf: write metadata")
|
|
||||||
gguf_writer.write_kv_data_to_file()
|
|
||||||
logger.info("gguf: write tensors")
|
|
||||||
gguf_writer.write_tensors_to_file()
|
|
||||||
|
|
||||||
gguf_writer.close()
|
|
||||||
|
|
||||||
logger.info(f"gguf: model successfully exported to '{args.outfile}'")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
|
@ -30,8 +30,8 @@ We recommend using openmp since it's easier to modify the cores being used.
|
||||||
Makefile:
|
Makefile:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
make LLAMA_BLIS=1 -j
|
make GGML_BLIS=1 -j
|
||||||
# make LLAMA_BLIS=1 benchmark-matmult
|
# make GGML_BLIS=1 llama-benchmark-matmult
|
||||||
```
|
```
|
||||||
|
|
||||||
CMake:
|
CMake:
|
||||||
|
@ -39,7 +39,7 @@ CMake:
|
||||||
```bash
|
```bash
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
|
cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
|
||||||
make -j
|
make -j
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
|
||||||
### 1. Convert the model to GGUF
|
### 1. Convert the model to GGUF
|
||||||
|
|
||||||
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
|
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
|
||||||
Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
|
Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
|
||||||
|
|
||||||
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
|
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
|
||||||
|
|
||||||
|
@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
|
||||||
|
|
||||||
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
|
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
|
||||||
|
|
||||||
Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
|
Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
|
||||||
|
|
||||||
## GGUF specification
|
## GGUF specification
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
## Verifying that the model is running on the GPU with CUDA
|
## Verifying that the model is running on the GPU with CUDA
|
||||||
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
||||||
```shell
|
```shell
|
||||||
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
||||||
```
|
```
|
||||||
|
|
||||||
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
|
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
|
||||||
|
@ -27,7 +27,7 @@ RAM: 32GB
|
||||||
|
|
||||||
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
|
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
|
||||||
|
|
||||||
Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
||||||
|
|
||||||
Result:
|
Result:
|
||||||
|
|
||||||
|
|
|
@ -12,44 +12,45 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||||
|
|
||||||
if (EMSCRIPTEN)
|
if (EMSCRIPTEN)
|
||||||
else()
|
else()
|
||||||
|
add_subdirectory(cvector-generator)
|
||||||
add_subdirectory(baby-llama)
|
add_subdirectory(baby-llama)
|
||||||
add_subdirectory(batched)
|
|
||||||
add_subdirectory(batched-bench)
|
add_subdirectory(batched-bench)
|
||||||
add_subdirectory(beam-search)
|
add_subdirectory(batched)
|
||||||
add_subdirectory(benchmark)
|
add_subdirectory(benchmark)
|
||||||
add_subdirectory(convert-llama2c-to-ggml)
|
add_subdirectory(convert-llama2c-to-ggml)
|
||||||
add_subdirectory(embedding)
|
add_subdirectory(embedding)
|
||||||
add_subdirectory(eval-callback)
|
add_subdirectory(eval-callback)
|
||||||
|
add_subdirectory(export-lora)
|
||||||
add_subdirectory(finetune)
|
add_subdirectory(finetune)
|
||||||
add_subdirectory(gritlm)
|
add_subdirectory(gbnf-validator)
|
||||||
add_subdirectory(gguf-split)
|
add_subdirectory(gguf-split)
|
||||||
|
add_subdirectory(gguf)
|
||||||
|
add_subdirectory(gritlm)
|
||||||
|
add_subdirectory(imatrix)
|
||||||
add_subdirectory(infill)
|
add_subdirectory(infill)
|
||||||
add_subdirectory(llama-bench)
|
add_subdirectory(llama-bench)
|
||||||
add_subdirectory(llava)
|
add_subdirectory(llava)
|
||||||
if (LLAMA_SYCL)
|
|
||||||
add_subdirectory(sycl)
|
|
||||||
endif()
|
|
||||||
add_subdirectory(main)
|
|
||||||
add_subdirectory(tokenize)
|
|
||||||
add_subdirectory(parallel)
|
|
||||||
add_subdirectory(perplexity)
|
|
||||||
add_subdirectory(quantize)
|
|
||||||
add_subdirectory(quantize-stats)
|
|
||||||
add_subdirectory(retrieval)
|
|
||||||
add_subdirectory(save-load-state)
|
|
||||||
add_subdirectory(simple)
|
|
||||||
add_subdirectory(passkey)
|
|
||||||
add_subdirectory(speculative)
|
|
||||||
add_subdirectory(lookahead)
|
add_subdirectory(lookahead)
|
||||||
add_subdirectory(lookup)
|
add_subdirectory(lookup)
|
||||||
add_subdirectory(gguf)
|
add_subdirectory(main)
|
||||||
add_subdirectory(train-text-from-scratch)
|
add_subdirectory(parallel)
|
||||||
add_subdirectory(imatrix)
|
add_subdirectory(passkey)
|
||||||
|
add_subdirectory(perplexity)
|
||||||
|
add_subdirectory(quantize-stats)
|
||||||
|
add_subdirectory(quantize)
|
||||||
|
add_subdirectory(retrieval)
|
||||||
|
if (GGML_RPC)
|
||||||
|
add_subdirectory(rpc)
|
||||||
|
endif()
|
||||||
if (LLAMA_BUILD_SERVER)
|
if (LLAMA_BUILD_SERVER)
|
||||||
add_subdirectory(server)
|
add_subdirectory(server)
|
||||||
endif()
|
endif()
|
||||||
add_subdirectory(export-lora)
|
if (GGML_SYCL)
|
||||||
if (LLAMA_RPC)
|
add_subdirectory(sycl)
|
||||||
add_subdirectory(rpc)
|
|
||||||
endif()
|
endif()
|
||||||
|
add_subdirectory(save-load-state)
|
||||||
|
add_subdirectory(simple)
|
||||||
|
add_subdirectory(speculative)
|
||||||
|
add_subdirectory(tokenize)
|
||||||
|
add_subdirectory(train-text-from-scratch)
|
||||||
endif()
|
endif()
|
||||||
|
|
|
@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
|
||||||
GEN_OPTIONS+=(--threads "$N_THREAD")
|
GEN_OPTIONS+=(--threads "$N_THREAD")
|
||||||
fi
|
fi
|
||||||
|
|
||||||
./main "${GEN_OPTIONS[@]}" \
|
./llama-cli "${GEN_OPTIONS[@]}" \
|
||||||
--model "$MODEL" \
|
--model "$MODEL" \
|
||||||
--in-prefix " " \
|
--in-prefix " " \
|
||||||
--in-suffix "${AI_NAME}:" \
|
--in-suffix "${AI_NAME}:" \
|
||||||
|
|
|
@ -1,19 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
#
|
|
||||||
# Temporary script - will be removed in the future
|
|
||||||
#
|
|
||||||
|
|
||||||
cd `dirname $0`
|
|
||||||
cd ..
|
|
||||||
|
|
||||||
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
|
|
||||||
--color \
|
|
||||||
-f ./prompts/alpaca.txt \
|
|
||||||
--ctx_size 2048 \
|
|
||||||
-n -1 \
|
|
||||||
-ins -b 256 \
|
|
||||||
--top_k 10000 \
|
|
||||||
--temp 0.2 \
|
|
||||||
--repeat_penalty 1.1 \
|
|
||||||
-t 7
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET baby-llama)
|
set(TARGET llama-baby-llama)
|
||||||
add_executable(${TARGET} baby-llama.cpp)
|
add_executable(${TARGET} baby-llama.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -522,8 +522,8 @@ static struct ggml_tensor * forward(
|
||||||
// wk shape [n_embd, n_embd, 1, 1]
|
// wk shape [n_embd, n_embd, 1, 1]
|
||||||
// Qcur shape [n_embd/n_head, n_head, N, 1]
|
// Qcur shape [n_embd/n_head, n_head, N, 1]
|
||||||
// Kcur shape [n_embd/n_head, n_head, N, 1]
|
// Kcur shape [n_embd/n_head, n_head, N, 1]
|
||||||
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
|
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
|
||||||
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
|
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
|
||||||
|
|
||||||
// store key and value to memory
|
// store key and value to memory
|
||||||
{
|
{
|
||||||
|
@ -759,8 +759,8 @@ static struct ggml_tensor * forward_batch(
|
||||||
// wk shape [n_embd, n_embd, 1, 1]
|
// wk shape [n_embd, n_embd, 1, 1]
|
||||||
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
|
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
|
||||||
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
|
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
|
||||||
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
|
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
|
||||||
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
|
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
|
||||||
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
|
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
|
||||||
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
|
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
|
||||||
|
|
||||||
|
@ -1056,7 +1056,7 @@ static struct ggml_tensor * forward_lora(
|
||||||
model->layers[il].wqb,
|
model->layers[il].wqb,
|
||||||
cur)),
|
cur)),
|
||||||
n_embd/n_head, n_head, N),
|
n_embd/n_head, n_head, N),
|
||||||
KQ_pos, n_rot, 0, 0);
|
KQ_pos, n_rot, 0);
|
||||||
struct ggml_tensor * Kcur = ggml_rope(ctx0,
|
struct ggml_tensor * Kcur = ggml_rope(ctx0,
|
||||||
ggml_reshape_3d(ctx0,
|
ggml_reshape_3d(ctx0,
|
||||||
ggml_mul_mat(ctx0,
|
ggml_mul_mat(ctx0,
|
||||||
|
@ -1065,7 +1065,7 @@ static struct ggml_tensor * forward_lora(
|
||||||
model->layers[il].wkb,
|
model->layers[il].wkb,
|
||||||
cur)),
|
cur)),
|
||||||
n_embd/n_head, n_head, N),
|
n_embd/n_head, n_head, N),
|
||||||
KQ_pos, n_rot, 0, 0);
|
KQ_pos, n_rot, 0);
|
||||||
|
|
||||||
// store key and value to memory
|
// store key and value to memory
|
||||||
{
|
{
|
||||||
|
|
|
@ -58,4 +58,4 @@ echo "$2
|
||||||
model=$1
|
model=$1
|
||||||
|
|
||||||
# generate the most likely continuation until the string "===" is found
|
# generate the most likely continuation until the string "===" is found
|
||||||
./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
|
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET batched-bench)
|
set(TARGET llama-batched-bench)
|
||||||
add_executable(${TARGET} batched-bench.cpp)
|
add_executable(${TARGET} batched-bench.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -10,16 +10,16 @@ There are 2 modes of operation:
|
||||||
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
|
./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
|
||||||
|
|
||||||
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
||||||
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
|
./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
|
||||||
|
|
||||||
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
||||||
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
|
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
|
||||||
|
|
||||||
# custom set of batches
|
# custom set of batches
|
||||||
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
|
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
|
||||||
```
|
```
|
||||||
|
|
||||||
## Sample results
|
## Sample results
|
||||||
|
|
|
@ -28,67 +28,27 @@ static std::vector<int> parse_list(char * p) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||||
|
gpt_params_print_usage(argc, argv, params);
|
||||||
|
|
||||||
|
LOG_TEE("\nexample usage:\n");
|
||||||
|
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
|
||||||
|
LOG_TEE("\n");
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (argc == 1 || argv[1][0] == '-') {
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
|
print_usage(argc, argv, params);
|
||||||
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
|
|
||||||
printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
int n_kv_max = 2048;
|
int is_pp_shared = params.is_pp_shared;
|
||||||
int n_batch = 2048;
|
|
||||||
int n_ubatch = 512;
|
|
||||||
bool flash_attn = false;
|
|
||||||
int is_pp_shared = 0;
|
|
||||||
int n_gpu_layers = 0;
|
|
||||||
|
|
||||||
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
|
std::vector<int> n_pp = params.n_pp;
|
||||||
std::vector<int> n_tg = { 128, 256, };
|
std::vector<int> n_tg = params.n_tg;
|
||||||
std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
|
std::vector<int> n_pl = params.n_pl;
|
||||||
//std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
|
|
||||||
|
|
||||||
if (argc >= 2) {
|
|
||||||
params.model = argv[1];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc >= 3) {
|
|
||||||
n_kv_max = std::atoi(argv[2]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc >= 4) {
|
|
||||||
n_batch = std::atoi(argv[3]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc >= 5) {
|
|
||||||
n_ubatch = std::atoi(argv[4]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc >= 6) {
|
|
||||||
flash_attn = std::atoi(argv[5]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc >= 7) {
|
|
||||||
is_pp_shared = std::atoi(argv[6]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc >= 8) {
|
|
||||||
n_gpu_layers = std::atoi(argv[7]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc >= 9) {
|
|
||||||
n_pp = parse_list(argv[8]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc >= 10) {
|
|
||||||
n_tg = parse_list(argv[9]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc >= 11) {
|
|
||||||
n_pl = parse_list(argv[10]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// init LLM
|
// init LLM
|
||||||
|
|
||||||
|
@ -97,12 +57,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// initialize the model
|
// initialize the model
|
||||||
|
|
||||||
llama_model_params model_params = llama_model_default_params();
|
llama_model_params model_params = llama_model_params_from_gpt_params(params);
|
||||||
|
|
||||||
const std::vector<float> t_split(llama_max_devices(), 0.0f);
|
|
||||||
|
|
||||||
model_params.n_gpu_layers = n_gpu_layers;
|
|
||||||
model_params.tensor_split = t_split.data();
|
|
||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||||
|
|
||||||
|
@ -111,16 +66,7 @@ int main(int argc, char ** argv) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_context_params ctx_params = llama_context_default_params();
|
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
|
||||||
|
|
||||||
ctx_params.seed = 1234;
|
|
||||||
ctx_params.n_ctx = n_kv_max;
|
|
||||||
ctx_params.n_batch = n_batch;
|
|
||||||
ctx_params.n_ubatch = n_ubatch;
|
|
||||||
ctx_params.flash_attn = flash_attn;
|
|
||||||
|
|
||||||
ctx_params.n_threads = params.n_threads;
|
|
||||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
|
||||||
|
|
||||||
// ensure enough sequences are available
|
// ensure enough sequences are available
|
||||||
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
|
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
|
||||||
|
@ -132,6 +78,8 @@ int main(int argc, char ** argv) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const int32_t n_kv_max = llama_n_ctx(ctx);
|
||||||
|
|
||||||
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
|
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
|
||||||
|
|
||||||
// decode in batches of ctx_params.n_batch tokens
|
// decode in batches of ctx_params.n_batch tokens
|
||||||
|
@ -175,7 +123,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
|
|
||||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
.PHONY: build
|
.PHONY: build
|
||||||
|
|
||||||
build:
|
build:
|
||||||
xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
|
xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
|
||||||
rm -f ./batched_swift
|
rm -f ./llama-batched-swift
|
||||||
ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
|
ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
import PackageDescription
|
import PackageDescription
|
||||||
|
|
||||||
let package = Package(
|
let package = Package(
|
||||||
name: "batched_swift",
|
name: "llama-batched-swift",
|
||||||
platforms: [.macOS(.v12)],
|
platforms: [.macOS(.v12)],
|
||||||
dependencies: [
|
dependencies: [
|
||||||
.package(name: "llama", path: "../../"),
|
.package(name: "llama", path: "../../"),
|
||||||
|
@ -13,7 +13,7 @@ let package = Package(
|
||||||
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
||||||
// Targets can depend on other targets in this package and products from dependencies.
|
// Targets can depend on other targets in this package and products from dependencies.
|
||||||
.executableTarget(
|
.executableTarget(
|
||||||
name: "batched_swift",
|
name: "llama-batched-swift",
|
||||||
dependencies: ["llama"],
|
dependencies: ["llama"],
|
||||||
path: "Sources",
|
path: "Sources",
|
||||||
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
|
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
This is a swift clone of `examples/batched`.
|
This is a swift clone of `examples/batched`.
|
||||||
|
|
||||||
$ `make`
|
$ `make`
|
||||||
$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
|
$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET batched)
|
set(TARGET llama-batched)
|
||||||
add_executable(${TARGET} batched.cpp)
|
add_executable(${TARGET} batched.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
The example demonstrates batched generation from a given prompt
|
The example demonstrates batched generation from a given prompt
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
|
./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
|
||||||
|
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
|
@ -7,48 +7,31 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||||
|
gpt_params_print_usage(argc, argv, params);
|
||||||
|
|
||||||
|
LOG_TEE("\nexample usage:\n");
|
||||||
|
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
|
||||||
|
LOG_TEE("\n");
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (argc == 1 || argv[1][0] == '-') {
|
params.prompt = "Hello my name is";
|
||||||
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
|
params.n_predict = 32;
|
||||||
|
|
||||||
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
print_usage(argc, argv, params);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// number of parallel batches
|
// number of parallel batches
|
||||||
int n_parallel = 1;
|
int n_parallel = params.n_parallel;
|
||||||
|
|
||||||
// total length of the sequences including the prompt
|
// total length of the sequences including the prompt
|
||||||
int n_len = 32;
|
int n_predict = 32;
|
||||||
|
|
||||||
// number of layers to offload to the GPU
|
|
||||||
int n_gpu_layers = 0;
|
|
||||||
|
|
||||||
if (argc >= 2) {
|
|
||||||
params.model = argv[1];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc >= 3) {
|
|
||||||
params.prompt = argv[2];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc >= 4) {
|
|
||||||
n_parallel = std::atoi(argv[3]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc >= 5) {
|
|
||||||
n_len = std::atoi(argv[4]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc >= 6) {
|
|
||||||
n_gpu_layers = std::atoi(argv[5]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.prompt.empty()) {
|
|
||||||
params.prompt = "Hello my name is";
|
|
||||||
}
|
|
||||||
|
|
||||||
process_escapes(params.prompt);
|
|
||||||
|
|
||||||
// init LLM
|
// init LLM
|
||||||
|
|
||||||
|
@ -57,9 +40,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// initialize the model
|
// initialize the model
|
||||||
|
|
||||||
llama_model_params model_params = llama_model_default_params();
|
llama_model_params model_params = llama_model_params_from_gpt_params(params);
|
||||||
|
|
||||||
model_params.n_gpu_layers = n_gpu_layers;
|
|
||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||||
|
|
||||||
|
@@ -73,18 +54,14 @@ int main(int argc, char ** argv) {
|
||||||
std::vector<llama_token> tokens_list;
|
std::vector<llama_token> tokens_list;
|
||||||
tokens_list = ::llama_tokenize(model, params.prompt, true);
|
tokens_list = ::llama_tokenize(model, params.prompt, true);
|
||||||
|
|
||||||
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
|
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
|
||||||
|
|
||||||
// initialize the context
|
// initialize the context
|
||||||
|
|
||||||
llama_context_params ctx_params = llama_context_default_params();
|
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
|
||||||
|
|
||||||
ctx_params.seed = 1234;
|
|
||||||
ctx_params.n_ctx = n_kv_req;
|
ctx_params.n_ctx = n_kv_req;
|
||||||
ctx_params.n_batch = std::max(n_len, n_parallel);
|
ctx_params.n_batch = std::max(n_predict, n_parallel);
|
||||||
ctx_params.n_seq_max = n_parallel;
|
|
||||||
ctx_params.n_threads = params.n_threads;
|
|
||||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
|
||||||
|
|
||||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
|
@@ -95,7 +72,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
|
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
|
||||||
|
|
||||||
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
||||||
if (n_kv_req > n_ctx) {
|
if (n_kv_req > n_ctx) {
|
||||||
|
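As a rough worked example of the sizing checked above (the numbers are illustrative, not taken from the source): if the prompt tokenizes to 5 tokens and the defaults shown in this diff are used (n_predict = 32, n_parallel = 4), then n_kv_req = 5 + (32 - 5) * 4 = 113, so the context created with ctx_params.n_ctx = n_kv_req must provide room for at least 113 KV-cache cells.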
@@ -156,7 +133,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
const auto t_main_start = ggml_time_us();
|
const auto t_main_start = ggml_time_us();
|
||||||
|
|
||||||
while (n_cur <= n_len) {
|
while (n_cur <= n_predict) {
|
||||||
// prepare the next batch
|
// prepare the next batch
|
||||||
llama_batch_clear(batch);
|
llama_batch_clear(batch);
|
||||||
|
|
||||||
|
@@ -192,7 +169,7 @@ int main(int argc, char ** argv) {
|
||||||
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||||
|
|
||||||
// is it an end of generation? -> mark the stream as finished
|
// is it an end of generation? -> mark the stream as finished
|
||||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
||||||
i_batch[i] = -1;
|
i_batch[i] = -1;
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
if (n_parallel > 1) {
|
if (n_parallel > 1) {
|
||||||
|
|
|
@@ -1,188 +0,0 @@
|
||||||
#include "common.h"
|
|
||||||
#include "llama.h"
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <cinttypes>
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstring>
|
|
||||||
#include <ctime>
|
|
||||||
#include <fstream>
|
|
||||||
#include <iostream>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
|
||||||
#include <signal.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#elif defined (_WIN32)
|
|
||||||
#define WIN32_LEAN_AND_MEAN
|
|
||||||
#ifndef NOMINMAX
|
|
||||||
# define NOMINMAX
|
|
||||||
#endif
|
|
||||||
#include <windows.h>
|
|
||||||
#include <signal.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Used for debugging to print out beam tokens.
|
|
||||||
struct ostream_beam_view {
|
|
||||||
llama_context * ctx;
|
|
||||||
llama_beam_view beam_view;
|
|
||||||
};
|
|
||||||
|
|
||||||
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
|
|
||||||
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
|
|
||||||
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
|
|
||||||
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
|
|
||||||
}
|
|
||||||
return os << ')';
|
|
||||||
}
|
|
||||||
|
|
||||||
// Put here anything you want back in beam_search_callback().
|
|
||||||
struct beam_search_callback_data {
|
|
||||||
llama_context * ctx;
|
|
||||||
std::vector<llama_token> response;
|
|
||||||
};
|
|
||||||
|
|
||||||
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
|
||||||
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
|
||||||
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
|
|
||||||
return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Function matching type llama_beam_search_callback_fn_t.
|
|
||||||
// Custom callback example is called each time the beams lengths increase:
|
|
||||||
// * Show progress by printing ',' following by number of convergent beam tokens if any.
|
|
||||||
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
|
||||||
// This is also called when the stop condition is met.
|
|
||||||
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
|
||||||
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
|
||||||
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
|
|
||||||
// Mark beams as EOS as needed.
|
|
||||||
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
|
||||||
llama_beam_view& beam_view = beams_state.beam_views[i];
|
|
||||||
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
|
|
||||||
beam_view.eob = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
printf(","); // Show progress
|
|
||||||
if (const size_t n = beams_state.common_prefix_length) {
|
|
||||||
callback_data.response.resize(callback_data.response.size() + n);
|
|
||||||
assert(0u < beams_state.n_beams);
|
|
||||||
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
|
||||||
std::copy(tokens, tokens + n, callback_data.response.end() - n);
|
|
||||||
printf("%zu", n);
|
|
||||||
}
|
|
||||||
fflush(stdout);
|
|
||||||
#if 1 // DEBUG: print current beams for this iteration
|
|
||||||
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
|
|
||||||
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
|
||||||
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char ** argv)
|
|
||||||
{
|
|
||||||
gpt_params params;
|
|
||||||
//params.n_gpu_layers = 200;
|
|
||||||
|
|
||||||
//---------------------------------
|
|
||||||
// Print help :
|
|
||||||
//---------------------------------
|
|
||||||
|
|
||||||
if ( argc < 2 || argv[1][0] == '-' )
|
|
||||||
{
|
|
||||||
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
|
|
||||||
return 1 ;
|
|
||||||
}
|
|
||||||
|
|
||||||
//---------------------------------
|
|
||||||
// Load parameters :
|
|
||||||
//---------------------------------
|
|
||||||
|
|
||||||
params.model = argv[1];
|
|
||||||
|
|
||||||
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
|
|
||||||
|
|
||||||
if ( argc > 3 )
|
|
||||||
{
|
|
||||||
params.prompt = argv[3];
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( params.prompt.empty() )
|
|
||||||
{
|
|
||||||
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
//---------------------------------
|
|
||||||
// Init LLM :
|
|
||||||
//---------------------------------
|
|
||||||
|
|
||||||
llama_backend_init();
|
|
||||||
llama_numa_init(params.numa);
|
|
||||||
|
|
||||||
llama_model * model;
|
|
||||||
llama_context * ctx;
|
|
||||||
|
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params( params );
|
|
||||||
|
|
||||||
if ( model == NULL )
|
|
||||||
{
|
|
||||||
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
//---------------------------------
|
|
||||||
// Tokenize the prompt :
|
|
||||||
//---------------------------------
|
|
||||||
|
|
||||||
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
|
|
||||||
|
|
||||||
const size_t max_context_size = llama_n_ctx( ctx );
|
|
||||||
const size_t max_tokens_list_size = max_context_size - 4 ;
|
|
||||||
|
|
||||||
if (tokens_list.size() > max_tokens_list_size)
|
|
||||||
{
|
|
||||||
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
|
|
||||||
__func__ , tokens_list.size() , max_tokens_list_size );
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf( stderr, "\n\n" );
|
|
||||||
|
|
||||||
// Print the tokens from the prompt :
|
|
||||||
|
|
||||||
for( auto id : tokens_list )
|
|
||||||
{
|
|
||||||
std::cout << llama_token_to_piece(ctx, id);
|
|
||||||
}
|
|
||||||
std::cout << std::flush;
|
|
||||||
|
|
||||||
int n_past = 0;
|
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
|
|
||||||
{
|
|
||||||
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
n_past += tokens_list.size();
|
|
||||||
|
|
||||||
beam_search_callback_data callback_data{ctx, {}};
|
|
||||||
size_t const beam_width = static_cast<size_t>(params.n_beams);
|
|
||||||
int const n_predict = 256;
|
|
||||||
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
|
|
||||||
|
|
||||||
std::cout << "\n\n";
|
|
||||||
for (llama_token const token_id : callback_data.response) {
|
|
||||||
std::cout << llama_token_to_piece(ctx,token_id);
|
|
||||||
}
|
|
||||||
std::cout << std::endl;
|
|
||||||
|
|
||||||
llama_free( ctx );
|
|
||||||
llama_free_model( model );
|
|
||||||
|
|
||||||
llama_backend_free();
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
|
@@ -1,4 +1,4 @@
|
||||||
set(TARGET benchmark)
|
set(TARGET llama-bench-matmult)
|
||||||
add_executable(${TARGET} benchmark-matmult.cpp)
|
add_executable(${TARGET} benchmark-matmult.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
|
||||||
$PROMPT_TEMPLATE > $PROMPT_FILE
|
$PROMPT_TEMPLATE > $PROMPT_FILE
|
||||||
|
|
||||||
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
||||||
./main $GEN_OPTIONS \
|
./llama-cli $GEN_OPTIONS \
|
||||||
--model "$MODEL" \
|
--model "$MODEL" \
|
||||||
--threads "$N_THREAD" \
|
--threads "$N_THREAD" \
|
||||||
--n_predict "$N_PREDICTS" \
|
--n_predict "$N_PREDICTS" \
|
||||||
|
|
|
@@ -62,7 +62,7 @@ fi
|
||||||
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
|
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
|
||||||
echo 'Prompt cache does not exist, building...'
|
echo 'Prompt cache does not exist, building...'
|
||||||
# Default batch_size to 64 here for better user feedback during initial prompt processing
|
# Default batch_size to 64 here for better user feedback during initial prompt processing
|
||||||
./main 2>>"$LOG" \
|
./llama-cli 2>>"$LOG" \
|
||||||
--batch_size 64 \
|
--batch_size 64 \
|
||||||
"${OPTS[@]}" \
|
"${OPTS[@]}" \
|
||||||
--prompt-cache "$PROMPT_CACHE_FILE" \
|
--prompt-cache "$PROMPT_CACHE_FILE" \
|
||||||
|
@@ -109,13 +109,13 @@ while read -e line; do
|
||||||
|
|
||||||
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
|
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
|
||||||
|
|
||||||
./main 2>>"$LOG" "${OPTS[@]}" \
|
./llama-cli 2>>"$LOG" "${OPTS[@]}" \
|
||||||
--prompt-cache "$CUR_PROMPT_CACHE" \
|
--prompt-cache "$CUR_PROMPT_CACHE" \
|
||||||
--prompt-cache-all \
|
--prompt-cache-all \
|
||||||
--file "$CUR_PROMPT_FILE" \
|
--file "$CUR_PROMPT_FILE" \
|
||||||
--reverse-prompt "${USER_NAME}:" \
|
--reverse-prompt "${USER_NAME}:" \
|
||||||
--n_predict "$n_predict" |
|
--n_predict "$n_predict" |
|
||||||
skip_bytes 1 | # skip BOS token added by ./main
|
skip_bytes 1 | # skip BOS token added by ./llama-cli
|
||||||
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
|
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
|
||||||
skip_bytes "$n_prompt_len_pre" # print generation
|
skip_bytes "$n_prompt_len_pre" # print generation
|
||||||
|
|
||||||
|
@@ -133,7 +133,7 @@ while read -e line; do
|
||||||
# TODO get both messages in one go
|
# TODO get both messages in one go
|
||||||
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
|
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
|
||||||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
||||||
echo >&2 "Couldn't get number of tokens from ./main output!"
|
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@@ -144,7 +144,7 @@ while read -e line; do
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Update cache for next prompt in background, ideally during user input
|
# Update cache for next prompt in background, ideally during user input
|
||||||
./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
|
./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
|
||||||
--prompt-cache "$NEXT_PROMPT_CACHE" \
|
--prompt-cache "$NEXT_PROMPT_CACHE" \
|
||||||
--file "$NEXT_PROMPT_FILE" \
|
--file "$NEXT_PROMPT_FILE" \
|
||||||
--n_predict 1 &
|
--n_predict 1 &
|
||||||
|
|
|
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
|
||||||
$PROMPT_TEMPLATE > $PROMPT_FILE
|
$PROMPT_TEMPLATE > $PROMPT_FILE
|
||||||
|
|
||||||
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
||||||
./bin/main $GEN_OPTIONS \
|
./bin/llama-cli $GEN_OPTIONS \
|
||||||
--model "$MODEL" \
|
--model "$MODEL" \
|
||||||
--threads "$N_THREAD" \
|
--threads "$N_THREAD" \
|
||||||
--n_predict "$N_PREDICTS" \
|
--n_predict "$N_PREDICTS" \
|
||||||
|
|
|
@@ -11,6 +11,6 @@ cd ..
|
||||||
#
|
#
|
||||||
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
|
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
|
||||||
#
|
#
|
||||||
./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
|
./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
|
||||||
--repeat_penalty 1.0 --color -i \
|
--repeat_penalty 1.0 --color -i \
|
||||||
-r "User:" -f prompts/chat-with-bob.txt
|
-r "User:" -f prompts/chat-with-bob.txt
|
||||||
|
|
|
@@ -24,14 +24,16 @@ from abc import ABC, abstractmethod
|
||||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
|
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sentencepiece import SentencePieceProcessor
|
|
||||||
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
# use .parent.parent since we are in "examples" directory
|
||||||
|
sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
|
||||||
|
|
||||||
import gguf
|
import gguf
|
||||||
|
from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from typing_extensions import Self, TypeAlias
|
from typing_extensions import Self, TypeAlias
|
||||||
|
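The import hunk above reflects that the vocab helper classes deleted later in this diff (BpeVocab, SentencePieceVocab, LlamaHfVocab, etc.) now come from the gguf-py package. A minimal sketch of using one of them from its new home, assuming gguf-py is importable (for example via the sys.path insertion shown above) and using a placeholder model directory:

```python
from pathlib import Path

from gguf import SentencePieceVocab  # previously defined locally in convert.py

# Placeholder directory that is assumed to contain tokenizer.model
# (and optionally added_tokens.json), as the class expects.
vocab = SentencePieceVocab(Path("models/llama-7b-v2"))

# all_tokens() yields (token bytes, score, gguf.TokenType) tuples.
for text, score, toktype in vocab.all_tokens():
    print(text, score, toktype)
```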
@@ -174,7 +176,7 @@ class Params:
|
||||||
rope_scaling_type: gguf.RopeScalingType | None = None
|
rope_scaling_type: gguf.RopeScalingType | None = None
|
||||||
f_rope_freq_base: float | None = None
|
f_rope_freq_base: float | None = None
|
||||||
f_rope_scale: float | None = None
|
f_rope_scale: float | None = None
|
||||||
n_orig_ctx: int | None = None
|
n_ctx_orig: int | None = None
|
||||||
rope_finetuned: bool | None = None
|
rope_finetuned: bool | None = None
|
||||||
|
|
||||||
ftype: GGMLFileType | None = None
|
ftype: GGMLFileType | None = None
|
||||||
|
@@ -224,7 +226,7 @@ class Params:
|
||||||
with open(config_path) as f:
|
with open(config_path) as f:
|
||||||
config = json.load(f)
|
config = json.load(f)
|
||||||
|
|
||||||
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
|
rope_scaling_type = f_rope_scale = n_ctx_orig = rope_finetuned = None
|
||||||
rope_scaling = config.get("rope_scaling")
|
rope_scaling = config.get("rope_scaling")
|
||||||
|
|
||||||
if rope_scaling is not None and (typ := rope_scaling.get("type")):
|
if rope_scaling is not None and (typ := rope_scaling.get("type")):
|
||||||
|
@@ -234,7 +236,7 @@ class Params:
|
||||||
rope_scaling_type = gguf.RopeScalingType.LINEAR
|
rope_scaling_type = gguf.RopeScalingType.LINEAR
|
||||||
elif typ == "yarn":
|
elif typ == "yarn":
|
||||||
rope_scaling_type = gguf.RopeScalingType.YARN
|
rope_scaling_type = gguf.RopeScalingType.YARN
|
||||||
n_orig_ctx = rope_scaling['original_max_position_embeddings']
|
n_ctx_orig = rope_scaling['original_max_position_embeddings']
|
||||||
rope_finetuned = rope_scaling['finetuned']
|
rope_finetuned = rope_scaling['finetuned']
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError(f'Unknown rope scaling type: {typ}')
|
raise NotImplementedError(f'Unknown rope scaling type: {typ}')
|
||||||
|
@@ -270,7 +272,7 @@ class Params:
|
||||||
f_rope_freq_base = config.get("rope_theta"),
|
f_rope_freq_base = config.get("rope_theta"),
|
||||||
rope_scaling_type = rope_scaling_type,
|
rope_scaling_type = rope_scaling_type,
|
||||||
f_rope_scale = f_rope_scale,
|
f_rope_scale = f_rope_scale,
|
||||||
n_orig_ctx = n_orig_ctx,
|
n_ctx_orig = n_ctx_orig,
|
||||||
rope_finetuned = rope_finetuned,
|
rope_finetuned = rope_finetuned,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@@ -380,306 +382,6 @@ class Metadata:
|
||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
# vocab
|
|
||||||
#
|
|
||||||
|
|
||||||
|
|
||||||
@runtime_checkable
|
|
||||||
class BaseVocab(Protocol):
|
|
||||||
tokenizer_model: ClassVar[str]
|
|
||||||
name: ClassVar[str]
|
|
||||||
|
|
||||||
|
|
||||||
class NoVocab(BaseVocab):
|
|
||||||
tokenizer_model = "no_vocab"
|
|
||||||
name = "no_vocab"
|
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
|
||||||
return "<NoVocab for a model without integrated vocabulary>"
|
|
||||||
|
|
||||||
|
|
||||||
@runtime_checkable
|
|
||||||
class Vocab(BaseVocab, Protocol):
|
|
||||||
vocab_size: int
|
|
||||||
added_tokens_dict: dict[str, int]
|
|
||||||
added_tokens_list: list[str]
|
|
||||||
fname_tokenizer: Path
|
|
||||||
|
|
||||||
def __init__(self, base_path: Path): ...
|
|
||||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
|
|
||||||
|
|
||||||
|
|
||||||
class BpeVocab(Vocab):
|
|
||||||
tokenizer_model = "gpt2"
|
|
||||||
name = "bpe"
|
|
||||||
|
|
||||||
def __init__(self, base_path: Path):
|
|
||||||
added_tokens: dict[str, int] = {}
|
|
||||||
|
|
||||||
if (fname_tokenizer := base_path / 'vocab.json').exists():
|
|
||||||
# "slow" tokenizer
|
|
||||||
with open(fname_tokenizer, encoding="utf-8") as f:
|
|
||||||
self.vocab = json.load(f)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
|
||||||
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
|
|
||||||
added_tokens = json.load(f)
|
|
||||||
except FileNotFoundError:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
# "fast" tokenizer
|
|
||||||
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
|
|
||||||
|
|
||||||
# if this fails, FileNotFoundError propagates to caller
|
|
||||||
with open(fname_tokenizer, encoding="utf-8") as f:
|
|
||||||
tokenizer_json = json.load(f)
|
|
||||||
|
|
||||||
tokenizer_model: dict[str, Any] = tokenizer_json['model']
|
|
||||||
if (
|
|
||||||
tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
|
|
||||||
or tokenizer_json['decoder']['type'] != 'ByteLevel'
|
|
||||||
):
|
|
||||||
raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
|
|
||||||
|
|
||||||
self.vocab = tokenizer_model["vocab"]
|
|
||||||
|
|
||||||
if (added := tokenizer_json.get('added_tokens')) is not None:
|
|
||||||
# Added tokens here can be duplicates of the main vocabulary.
|
|
||||||
added_tokens = {item['content']: item['id']
|
|
||||||
for item in added
|
|
||||||
if item['content'] not in self.vocab}
|
|
||||||
|
|
||||||
vocab_size = len(self.vocab)
|
|
||||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
|
||||||
actual_ids = sorted(added_tokens.values())
|
|
||||||
if expected_ids != actual_ids:
|
|
||||||
expected_end_id = vocab_size + len(actual_ids) - 1
|
|
||||||
raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
|
|
||||||
f"{vocab_size} - {expected_end_id}; got {actual_ids}")
|
|
||||||
|
|
||||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
|
||||||
self.added_tokens_dict = added_tokens
|
|
||||||
self.added_tokens_list = [text for (text, idx) in items]
|
|
||||||
self.vocab_size_base = vocab_size
|
|
||||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
|
||||||
self.fname_tokenizer = fname_tokenizer
|
|
||||||
|
|
||||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
|
|
||||||
|
|
||||||
for i, _ in enumerate(self.vocab):
|
|
||||||
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
|
|
||||||
|
|
||||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
|
||||||
for text in self.added_tokens_list:
|
|
||||||
score = -1000.0
|
|
||||||
yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
|
|
||||||
|
|
||||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
|
||||||
yield from self.bpe_tokens()
|
|
||||||
yield from self.added_tokens()
|
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
|
||||||
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
|
||||||
|
|
||||||
|
|
||||||
class SentencePieceVocab(Vocab):
|
|
||||||
tokenizer_model = "llama"
|
|
||||||
name = "spm"
|
|
||||||
|
|
||||||
def __init__(self, base_path: Path):
|
|
||||||
added_tokens: dict[str, int] = {}
|
|
||||||
if (fname_tokenizer := base_path / 'tokenizer.model').exists():
|
|
||||||
# normal location
|
|
||||||
try:
|
|
||||||
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
|
|
||||||
added_tokens = json.load(f)
|
|
||||||
except FileNotFoundError:
|
|
||||||
pass
|
|
||||||
elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
|
|
||||||
# not found in alternate location either
|
|
||||||
raise FileNotFoundError('Cannot find tokenizer.model')
|
|
||||||
|
|
||||||
self.sentencepiece_tokenizer = SentencePieceProcessor()
|
|
||||||
self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
|
|
||||||
vocab_size = self.sentencepiece_tokenizer.vocab_size()
|
|
||||||
|
|
||||||
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
|
|
||||||
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
|
|
||||||
actual_new_ids = sorted(new_tokens.keys())
|
|
||||||
|
|
||||||
if expected_new_ids != actual_new_ids:
|
|
||||||
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
|
|
||||||
|
|
||||||
# Token pieces that were added to the base vocabulary.
|
|
||||||
self.added_tokens_dict = added_tokens
|
|
||||||
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
|
||||||
self.vocab_size_base = vocab_size
|
|
||||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
|
||||||
self.fname_tokenizer = fname_tokenizer
|
|
||||||
|
|
||||||
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
|
||||||
tokenizer = self.sentencepiece_tokenizer
|
|
||||||
for i in range(tokenizer.vocab_size()):
|
|
||||||
piece = tokenizer.IdToPiece(i)
|
|
||||||
text = piece.encode("utf-8")
|
|
||||||
score: float = tokenizer.GetScore(i)
|
|
||||||
|
|
||||||
toktype = gguf.TokenType.NORMAL
|
|
||||||
if tokenizer.IsUnknown(i):
|
|
||||||
toktype = gguf.TokenType.UNKNOWN
|
|
||||||
if tokenizer.IsControl(i):
|
|
||||||
toktype = gguf.TokenType.CONTROL
|
|
||||||
|
|
||||||
# NOTE: I think added_tokens are user defined.
|
|
||||||
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
|
|
||||||
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
|
|
||||||
|
|
||||||
if tokenizer.IsUnused(i):
|
|
||||||
toktype = gguf.TokenType.UNUSED
|
|
||||||
if tokenizer.IsByte(i):
|
|
||||||
toktype = gguf.TokenType.BYTE
|
|
||||||
|
|
||||||
yield text, score, toktype
|
|
||||||
|
|
||||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
|
||||||
for text in self.added_tokens_list:
|
|
||||||
score = -1000.0
|
|
||||||
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
|
|
||||||
|
|
||||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
|
||||||
yield from self.sentencepiece_tokens()
|
|
||||||
yield from self.added_tokens()
|
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
|
||||||
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
|
||||||
|
|
||||||
|
|
||||||
class LlamaHfVocab(Vocab):
|
|
||||||
tokenizer_model = "llama"
|
|
||||||
name = "hfft"
|
|
||||||
|
|
||||||
def __init__(self, base_path: Path):
|
|
||||||
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
|
|
||||||
# if this fails, FileNotFoundError propagates to caller
|
|
||||||
with open(fname_tokenizer, encoding='utf-8') as f:
|
|
||||||
tokenizer_json = json.load(f)
|
|
||||||
|
|
||||||
# pre-check so we know if we need transformers
|
|
||||||
tokenizer_model: dict[str, Any] = tokenizer_json['model']
|
|
||||||
is_llama3 = (
|
|
||||||
tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
|
|
||||||
and not tokenizer_model.get('byte_fallback', True)
|
|
||||||
)
|
|
||||||
if is_llama3:
|
|
||||||
raise TypeError('Llama 3 must be converted with BpeVocab')
|
|
||||||
|
|
||||||
if not is_llama3 and (
|
|
||||||
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
|
|
||||||
or tokenizer_json['decoder']['type'] != 'Sequence'
|
|
||||||
):
|
|
||||||
raise FileNotFoundError('Cannot find Llama BPE tokenizer')
|
|
||||||
|
|
||||||
try:
|
|
||||||
from transformers import AutoTokenizer
|
|
||||||
except ImportError as e:
|
|
||||||
raise ImportError(
|
|
||||||
"To use LlamaHfVocab, please install the `transformers` package. "
|
|
||||||
"You can install it with `pip install transformers`."
|
|
||||||
) from e
|
|
||||||
|
|
||||||
# Allow the tokenizer to default to slow or fast versions.
|
|
||||||
# Explicitly set tokenizer to use local paths.
|
|
||||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
|
||||||
base_path,
|
|
||||||
cache_dir=base_path,
|
|
||||||
local_files_only=True,
|
|
||||||
)
|
|
||||||
assert self.tokenizer.is_fast # assume tokenizer.json is used
|
|
||||||
|
|
||||||
# Initialize lists and dictionaries for added tokens
|
|
||||||
self.added_tokens_list = []
|
|
||||||
self.added_tokens_dict = dict()
|
|
||||||
self.added_tokens_ids = set()
|
|
||||||
|
|
||||||
# Process added tokens
|
|
||||||
for tok, tokidx in sorted(
|
|
||||||
self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
|
|
||||||
):
|
|
||||||
# Only consider added tokens that are not in the base vocabulary
|
|
||||||
if tokidx >= self.tokenizer.vocab_size:
|
|
||||||
self.added_tokens_list.append(tok)
|
|
||||||
self.added_tokens_dict[tok] = tokidx
|
|
||||||
self.added_tokens_ids.add(tokidx)
|
|
||||||
|
|
||||||
# Store special tokens and their IDs
|
|
||||||
self.specials = {
|
|
||||||
tok: self.tokenizer.get_vocab()[tok]
|
|
||||||
for tok in self.tokenizer.all_special_tokens
|
|
||||||
}
|
|
||||||
self.special_ids = set(self.tokenizer.all_special_ids)
|
|
||||||
|
|
||||||
# Set vocabulary sizes
|
|
||||||
self.vocab_size_base = self.tokenizer.vocab_size
|
|
||||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
|
||||||
|
|
||||||
self.fname_tokenizer = fname_tokenizer
|
|
||||||
|
|
||||||
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
|
||||||
reverse_vocab = {
|
|
||||||
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
|
|
||||||
}
|
|
||||||
|
|
||||||
for token_id in range(self.vocab_size_base):
|
|
||||||
# Skip processing added tokens here
|
|
||||||
if token_id in self.added_tokens_ids:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Convert token text to bytes
|
|
||||||
token_text = reverse_vocab[token_id].encode("utf-8")
|
|
||||||
|
|
||||||
# Yield token text, score, and type
|
|
||||||
yield token_text, self.get_token_score(token_id), self.get_token_type(
|
|
||||||
token_id, token_text, self.special_ids # Reuse already stored special IDs
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
|
|
||||||
# Special case for byte tokens
|
|
||||||
if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
|
|
||||||
return gguf.TokenType.BYTE
|
|
||||||
|
|
||||||
# Determine token type based on whether it's a special token
|
|
||||||
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
|
|
||||||
|
|
||||||
def get_token_score(self, token_id: int) -> float:
|
|
||||||
# Placeholder for actual logic to determine the token's score
|
|
||||||
# This needs to be implemented based on specific requirements
|
|
||||||
return -1000.0 # Default score
|
|
||||||
|
|
||||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
|
||||||
for text in self.added_tokens_list:
|
|
||||||
if text in self.specials:
|
|
||||||
toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
|
|
||||||
score = self.get_token_score(self.specials[text])
|
|
||||||
else:
|
|
||||||
toktype = gguf.TokenType.USER_DEFINED
|
|
||||||
score = -1000.0
|
|
||||||
|
|
||||||
yield text.encode("utf-8"), score, toktype
|
|
||||||
|
|
||||||
def has_newline_token(self):
|
|
||||||
return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
|
|
||||||
|
|
||||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
|
||||||
yield from self.hf_tokens()
|
|
||||||
yield from self.added_tokens()
|
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
|
||||||
return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# data loading
|
# data loading
|
||||||
# TODO: reuse (probably move to gguf.py?)
|
# TODO: reuse (probably move to gguf.py?)
|
||||||
|
@@ -1162,8 +864,8 @@ class OutputFile:
|
||||||
self.gguf.add_rope_scaling_type(params.rope_scaling_type)
|
self.gguf.add_rope_scaling_type(params.rope_scaling_type)
|
||||||
self.gguf.add_rope_scaling_factor(params.f_rope_scale)
|
self.gguf.add_rope_scaling_factor(params.f_rope_scale)
|
||||||
|
|
||||||
if params.n_orig_ctx is not None:
|
if params.n_ctx_orig is not None:
|
||||||
self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
|
self.gguf.add_rope_scaling_orig_ctx_len(params.n_ctx_orig)
|
||||||
|
|
||||||
if params.rope_finetuned is not None:
|
if params.rope_finetuned is not None:
|
||||||
self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
|
self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
|
|
@@ -1,4 +1,4 @@
|
||||||
set(TARGET convert-llama2c-to-ggml)
|
set(TARGET llama-convert-llama2c-to-ggml)
|
||||||
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
|
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
Some files were not shown because too many files have changed in this diff.