Merge remote-tracking branch 'origin/master' into json-bounds2

Commit f03e9b935b by Olivier Chafik, 2024-06-13 00:45:11 +01:00
175 changed files with 3867 additions and 2668 deletions


@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
stage('Running llama.cpp'){ stage('Running llama.cpp'){
sh'''#!/bin/bash sh'''#!/bin/bash
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64 qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
cat llama_log.txt # Printing results cat llama_log.txt # Printing results
''' '''
} }


@ -23,13 +23,13 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA # Enable CUDA
ENV LLAMA_CUDA=1 ENV LLAMA_CUDA=1
RUN make -j$(nproc) main RUN make -j$(nproc) llama-cli
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y libgomp1 apt-get install -y libgomp1
COPY --from=build /app/main /main COPY --from=build /app/llama-cli /llama-cli
ENTRYPOINT [ "/main" ] ENTRYPOINT [ "/llama-cli" ]


@ -0,0 +1,26 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-cli
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]


@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++ ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make -j$(nproc) main RUN make -j$(nproc) llama-cli
ENTRYPOINT [ "/app/main" ] ENTRYPOINT [ "/app/llama-cli" ]


@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
WORKDIR /app WORKDIR /app
COPY . . COPY . .
RUN cmake -B build -DLLAMA_VULKAN=1 && \ RUN cmake -B build -DLLAMA_VULKAN=1 && \
cmake --build build --config Release --target main cmake --build build --config Release --target llama-cli
# Clean up # Clean up
WORKDIR / WORKDIR /
RUN cp /app/build/bin/main /main && \ RUN cp /app/build/bin/llama-cli /llama-cli && \
rm -rf /app rm -rf /app
ENV LC_ALL=C.utf8 ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ] ENTRYPOINT [ "/llama-cli" ]


@ -9,15 +9,15 @@ WORKDIR /app
COPY . . COPY . .
RUN make -j$(nproc) main RUN make -j$(nproc) llama-cli
FROM ubuntu:$UBUNTU_VERSION as runtime FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y libgomp1 apt-get install -y libgomp1
COPY --from=build /app/main /main COPY --from=build /app/llama-cli /llama-cli
ENV LC_ALL=C.utf8 ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ] ENTRYPOINT [ "/llama-cli" ]


@ -36,9 +36,9 @@ make -j LLAMA_CLBLAST=1
%install %install
mkdir -p %{buildroot}%{_bindir}/ mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamaclblast cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli
cp -p server %{buildroot}%{_bindir}/llamaclblastserver cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server
cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple
mkdir -p %{buildroot}/usr/lib/systemd/system mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
[Service] [Service]
Type=simple Type=simple
EnvironmentFile=/etc/sysconfig/llama EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID ExecReload=/bin/kill -s HUP $MAINPID
Restart=never Restart=never
@ -67,9 +67,9 @@ rm -rf %{buildroot}
rm -rf %{_builddir}/* rm -rf %{_builddir}/*
%files %files
%{_bindir}/llamaclblast %{_bindir}/llama-clblast-cli
%{_bindir}/llamaclblastserver %{_bindir}/llama-clblast-server
%{_bindir}/llamaclblastsimple %{_bindir}/llama-clblast-simple
/usr/lib/systemd/system/llamaclblast.service /usr/lib/systemd/system/llamaclblast.service
%config /etc/sysconfig/llama %config /etc/sysconfig/llama


@ -36,9 +36,9 @@ make -j LLAMA_CUDA=1
%install %install
mkdir -p %{buildroot}%{_bindir}/ mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppcuda cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
cp -p server %{buildroot}%{_bindir}/llamacppcudaserver cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
mkdir -p %{buildroot}/usr/lib/systemd/system mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
[Service] [Service]
Type=simple Type=simple
EnvironmentFile=/etc/sysconfig/llama EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID ExecReload=/bin/kill -s HUP $MAINPID
Restart=never Restart=never
@ -67,9 +67,9 @@ rm -rf %{buildroot}
rm -rf %{_builddir}/* rm -rf %{_builddir}/*
%files %files
%{_bindir}/llamacppcuda %{_bindir}/llama-cuda-cli
%{_bindir}/llamacppcudaserver %{_bindir}/llama-cuda-server
%{_bindir}/llamacppcudasimple %{_bindir}/llama-cuda-simple
/usr/lib/systemd/system/llamacuda.service /usr/lib/systemd/system/llamacuda.service
%config /etc/sysconfig/llama %config /etc/sysconfig/llama


@ -38,9 +38,9 @@ make -j
%install %install
mkdir -p %{buildroot}%{_bindir}/ mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llama cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
cp -p server %{buildroot}%{_bindir}/llamaserver cp -p llama-server %{buildroot}%{_bindir}/llama-server
cp -p simple %{buildroot}%{_bindir}/llamasimple cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
mkdir -p %{buildroot}/usr/lib/systemd/system mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
[Service] [Service]
Type=simple Type=simple
EnvironmentFile=/etc/sysconfig/llama EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaserver $LLAMA_ARGS ExecStart=/usr/bin/llama-server $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID ExecReload=/bin/kill -s HUP $MAINPID
Restart=never Restart=never
@ -69,9 +69,9 @@ rm -rf %{buildroot}
rm -rf %{_builddir}/* rm -rf %{_builddir}/*
%files %files
%{_bindir}/llama %{_bindir}/llama-cli
%{_bindir}/llamaserver %{_bindir}/llama-server
%{_bindir}/llamasimple %{_bindir}/llama-simple
/usr/lib/systemd/system/llama.service /usr/lib/systemd/system/llama.service
%config /etc/sysconfig/llama %config /etc/sysconfig/llama


@ -25,13 +25,13 @@ ENV LLAMA_CUDA=1
# Enable cURL # Enable cURL
ENV LLAMA_CURL=1 ENV LLAMA_CURL=1
RUN make -j$(nproc) server RUN make -j$(nproc) llama-server
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/server /server COPY --from=build /app/llama-server /llama-server
ENTRYPOINT [ "/server" ] ENTRYPOINT [ "/llama-server" ]


@ -0,0 +1,29 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-server
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
COPY --from=build /app/build/bin/llama-server /llama-server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-server" ]


@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev apt-get install -y libcurl4-openssl-dev
RUN make -j$(nproc) RUN make -j$(nproc) llama-server
ENTRYPOINT [ "/app/server" ] ENTRYPOINT [ "/app/llama-server" ]


@ -19,13 +19,13 @@ RUN apt-get update && \
WORKDIR /app WORKDIR /app
COPY . . COPY . .
RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \ RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build build --config Release --target server cmake --build build --config Release --target llama-server
# Clean up # Clean up
WORKDIR / WORKDIR /
RUN cp /app/build/bin/server /server && \ RUN cp /app/build/bin/llama-server /llama-server && \
rm -rf /app rm -rf /app
ENV LC_ALL=C.utf8 ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ] ENTRYPOINT [ "/llama-server" ]


@ -11,15 +11,15 @@ COPY . .
ENV LLAMA_CURL=1 ENV LLAMA_CURL=1
RUN make -j$(nproc) server RUN make -j$(nproc) llama-server
FROM ubuntu:$UBUNTU_VERSION as runtime FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/server /server COPY --from=build /app/llama-server /llama-server
ENV LC_ALL=C.utf8 ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ] ENTRYPOINT [ "/llama-server" ]


@ -1,34 +0,0 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build build --config Release --target main
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/main /main
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]


@ -6,11 +6,11 @@
let let
inherit (config.packages) default; inherit (config.packages) default;
binaries = [ binaries = [
"llama" "llama-cli"
"llama-embedding" "llama-embedding"
"llama-server" "llama-server"
"quantize" "llama-quantize"
"train-text-from-scratch" "llama-train-text-from-scratch"
]; ];
mkApp = name: { mkApp = name: {
type = "app"; type = "app";


@ -243,8 +243,6 @@ effectiveStdenv.mkDerivation (
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
# if they haven't been added yet. # if they haven't been added yet.
postInstall = '' postInstall = ''
mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
mkdir -p $out/include mkdir -p $out/include
cp $src/llama.h $out/include/ cp $src/llama.h $out/include/
''; '';
@ -294,7 +292,7 @@ effectiveStdenv.mkDerivation (
license = lib.licenses.mit; license = lib.licenses.mit;
# Accommodates `nix run` and `lib.getExe` # Accommodates `nix run` and `lib.getExe`
mainProgram = "llama"; mainProgram = "llama-cli";
# These people might respond, on the best effort basis, if you ping them # These people might respond, on the best effort basis, if you ping them
# in case of Nix-specific regressions or for reviewing Nix-specific PRs. # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
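Since `mainProgram` is what `nix run` and `lib.getExe` resolve, a sketch of how the rename surfaces to flake users; the flake reference and model path are illustrative, and the per-binary app names are assumed to follow the `binaries` list above:

```sh
# Runs the default package's mainProgram, which is now llama-cli.
nix run github:ggerganov/llama.cpp -- -m models/llama-2-7b.Q4_0.gguf -p "Hello" -n 32

# Individual apps are assumed to keep the names from the binaries list, e.g. the server:
nix run github:ggerganov/llama.cpp#llama-server -- -m models/llama-2-7b.Q4_0.gguf
```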


@ -1,45 +0,0 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target server
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]


@ -10,11 +10,11 @@ shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert-hf-to-gguf.py "$@" python3 ./convert-hf-to-gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$@" ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./main "$@" ./llama-cli "$@"
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
./finetune "$@" ./llama-finetune "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..." echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do for i in `ls $1/$2/ggml-model-f16.bin*`; do
@ -22,11 +22,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}" echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
./quantize "$i" "${i/f16/q4_0}" q4_0 ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
fi fi
done done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./server "$@" ./llama-server "$@"
else else
echo "Unknown command: $arg1" echo "Unknown command: $arg1"
echo "Available commands: " echo "Available commands: "


@ -12,8 +12,8 @@ build*/
models/* models/*
/main /llama-cli
/quantize /llama-quantize
arm_neon.h arm_neon.h
compile_commands.json compile_commands.json


@ -24,7 +24,7 @@ body:
label: Name and Version label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string) description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: | placeholder: |
$./main --version $./llama-cli --version
version: 2999 (42b4109e) version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations: validations:


@ -24,7 +24,7 @@ body:
label: Name and Version label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string) description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: | placeholder: |
$./main --version $./llama-cli --version
version: 2999 (42b4109e) version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations: validations:


@ -24,7 +24,7 @@ body:
label: Name and Version label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string) description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: | placeholder: |
$./main --version $./llama-cli --version
version: 2999 (42b4109e) version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations: validations:


@ -24,7 +24,7 @@ body:
label: Name and Version label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string) description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: | placeholder: |
$./main --version $./llama-cli --version
version: 2999 (42b4109e) version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations: validations:

.github/pull_request_template.md (new file)

@ -0,0 +1,5 @@
- Self Reported Review Complexity:
- [ ] Review Complexity : Low
- [ ] Review Complexity : Medium
- [ ] Review Complexity : High
- [ ] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)


@ -119,7 +119,7 @@ jobs:
-DLLAMA_FATAL_WARNINGS=OFF \ -DLLAMA_FATAL_WARNINGS=OFF \
-DLLAMA_ALL_WARNINGS=OFF \ -DLLAMA_ALL_WARNINGS=OFF \
-DCMAKE_BUILD_TYPE=Release; -DCMAKE_BUILD_TYPE=Release;
cmake --build build --config Release -j $(nproc) --target server cmake --build build --config Release -j $(nproc) --target llama-server
- name: Download the dataset - name: Download the dataset
id: download_dataset id: download_dataset


@ -13,7 +13,7 @@ on:
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m'] paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
pull_request: pull_request:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m'] paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -103,12 +103,10 @@ jobs:
id: cmake_build id: cmake_build
run: | run: |
sysctl -a sysctl -a
mkdir build
cd build
# Metal is disabled due to intermittent failures with Github runners not having a GPU: # Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON .. cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test - name: Test
id: cmake_test id: cmake_test
@ -241,8 +239,8 @@ jobs:
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
echo "Fetch llama2c model" echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Determine tag name - name: Determine tag name
id: tag id: tag
@ -684,7 +682,7 @@ jobs:
cmake --build build --config ${{ matrix.build }} -j $(nproc) cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake: windows-latest-cmake:
runs-on: windows-latest runs-on: windows-2019
env: env:
OPENBLAS_VERSION: 0.3.23 OPENBLAS_VERSION: 0.3.23
@ -829,7 +827,7 @@ jobs:
name: llama-bin-win-${{ matrix.build }}.zip name: llama-bin-win-${{ matrix.build }}.zip
windows-latest-cmake-cuda: windows-latest-cmake-cuda:
runs-on: windows-latest runs-on: windows-2019
strategy: strategy:
matrix: matrix:
@ -843,8 +841,9 @@ jobs:
with: with:
fetch-depth: 0 fetch-depth: 0
- uses: Jimver/cuda-toolkit@v0.2.11 - name: Install CUDA toolkit
id: cuda-toolkit id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.15
with: with:
cuda: ${{ matrix.cuda }} cuda: ${{ matrix.cuda }}
method: 'network' method: 'network'


@ -30,20 +30,20 @@ jobs:
strategy: strategy:
matrix: matrix:
config: config:
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
# have disabled them for now until the reason why # have disabled them for now until the reason why
# is understood. # is understood.
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" } - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
steps: steps:
- name: Check out the repo - name: Check out the repo
uses: actions/checkout@v4 uses: actions/checkout@v4


@ -16,11 +16,9 @@ on:
branches: branches:
- master - master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
pull_request_target: pull_request:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
schedule:
- cron: '2 4 * * *'
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@ -98,7 +96,7 @@ jobs:
-DLLAMA_CURL=ON \ -DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests - name: Tests
id: server_integration_tests id: server_integration_tests
@ -115,7 +113,7 @@ jobs:
server-windows: server-windows:
runs-on: windows-latest runs-on: windows-2019
steps: steps:
- name: Clone - name: Clone
@ -138,7 +136,7 @@ jobs:
id: cmake_build id: cmake_build
run: | run: |
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup - name: Python setup
id: setup_python id: setup_python

.gitignore

@ -46,48 +46,9 @@ models/*
models-mnt models-mnt
/Pipfile /Pipfile
/baby-llama
/beam-search
/benchmark-matmult
/convert-llama2c-to-ggml
/embd-input-test
/embedding
/eval-callback
/gguf
/gguf-llama-simple
/gguf-split
/gritlm
/imatrix
/infill
/libllama.so /libllama.so
/llama-bench /llama-*
/llava-cli llama-batched-swift
/lookahead
/lookup
/lookup-create
/lookup-merge
/lookup-stats
/main
/metal
/passkey
/perplexity
/q8dot
/quantize
/quantize-stats
/result
/save-load-state
/server
/simple
/batched
/batched-bench
/export-lora
/finetune
/retrieval
/speculative
/parallel
/train-text-from-scratch
/tokenize
/vdot
/common/build-info.cpp /common/build-info.cpp
arm_neon.h arm_neon.h
compile_commands.json compile_commands.json


@ -402,12 +402,26 @@ if (LLAMA_CUBLAS)
endif() endif()
if (LLAMA_CUDA) if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17) cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
find_package(CUDAToolkit) find_package(CUDAToolkit)
if (CUDAToolkit_FOUND) if (CUDAToolkit_FOUND)
message(STATUS "CUDA found") message(STATUS "CUDA found")
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
enable_language(CUDA) enable_language(CUDA)
set(GGML_HEADERS_CUDA ggml-cuda.h) set(GGML_HEADERS_CUDA ggml-cuda.h)
@ -472,21 +486,6 @@ if (LLAMA_CUDA)
else() else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif() endif()
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else() else()
message(WARNING "CUDA not found") message(WARNING "CUDA not found")
endif() endif()
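Note that the relocated block only sets `CMAKE_CUDA_ARCHITECTURES` when it is not already defined, so it can still be overridden at configure time; a minimal sketch (the value `86` is just an example for an Ampere GPU):

```sh
# Pin a single CUDA architecture instead of the "52;61;70" default chosen above.
cmake -B build -DLLAMA_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=86
cmake --build build --config Release -j $(nproc) --target llama-cli
```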

CONTRIBUTING.md (new file)

@ -0,0 +1,14 @@
# Contributing Guidelines
## Checklist
* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
* Execute [the full CI locally on your machine](ci/README.md) before publishing
## PR formatting
* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
  - The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your convenience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`
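A minimal sketch of the pre-publication checks described above, assuming a Make-based build; the `ci/run.sh` invocation and its two directory arguments are taken on the assumption that they match what `ci/README.md` documents:

```sh
# Build the test binaries (the Makefile's `tests` target covers TEST_TARGETS) and run the backend tests.
make -j$(nproc) tests
./tests/test-backend-ops

# Run the full CI locally, writing results and downloaded models into throwaway directories.
mkdir -p tmp/results tmp/mnt
bash ./ci/run.sh ./tmp/results ./tmp/mnt
```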

Makefile

@ -1,8 +1,44 @@
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = \ BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ libllava.a \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ llama-baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o llama-batched \
llama-batched-bench \
llama-bench \
llama-benchmark-matmult \
llama-cli \
llama-convert-llama2c-to-ggml \
llama-embedding \
llama-eval-callback \
llama-export-lora \
llama-finetune \
llama-gbnf-validator \
llama-gguf \
llama-gguf-split \
llama-gritlm \
llama-imatrix \
llama-infill \
llama-llava-cli \
llama-lookahead \
llama-lookup \
llama-lookup-create \
llama-lookup-merge \
llama-lookup-stats \
llama-parallel \
llama-passkey \
llama-perplexity \
llama-q8dot \
llama-quantize \
llama-quantize-stats \
llama-retrieval \
llama-save-load-state \
llama-server \
llama-simple \
llama-speculative \
llama-tokenize \
llama-train-text-from-scratch \
llama-vdot \
tests/test-c.o
# Binaries only useful for tests # Binaries only useful for tests
TEST_TARGETS = \ TEST_TARGETS = \
@ -777,7 +813,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS) ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
clean: clean:
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) rm -vrf *.o tests/*.o *.so *.a *.dll common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf ggml-cuda/*.o rm -vrf ggml-cuda/*.o
rm -vrf ggml-cuda/template-instances/*.o rm -vrf ggml-cuda/template-instances/*.o
find examples pocs -type f -name "*.o" -delete find examples pocs -type f -name "*.o" -delete
@ -793,62 +829,62 @@ clean:
# Helper function that replaces .c, .cpp, and .cu file endings with .o: # Helper function that replaces .c, .cpp, and .cu file endings with .o:
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) llama-cli: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo @echo
@echo '==== Run ./main -h for help. ====' @echo '==== Run ./llama-cli -h for help. ===='
@echo @echo
infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) llama-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) llama-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
@ -861,23 +897,23 @@ examples/server/%.hpp: examples/server/public/% Makefile
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
) > $@ ) > $@
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -888,55 +924,61 @@ llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS)
libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) llama-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) llama-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) llama-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) llama-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) llama-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) llama-lookup-create: examples/lookup/lookup-create.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-lookup-merge: examples/lookup/lookup-merge.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-lookup-stats: examples/lookup/lookup-stats.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -962,20 +1004,20 @@ build-info.o: common/build-info.cpp
tests: $(TEST_TARGETS) tests: $(TEST_TARGETS)
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS) llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
run-benchmark-matmult: benchmark-matmult run-benchmark-matmult: llama-benchmark-matmult
./$@ ./$@
.PHONY: run-benchmark-matmult swift .PHONY: run-benchmark-matmult swift
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) llama-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) llama-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
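With the new names in place, individual tools are built by their `llama-` prefixed targets rather than the old short names; a small sketch:

```sh
# Build only the binaries you need instead of the full BUILD_TARGETS list.
make -j$(nproc) llama-cli llama-server llama-quantize

# The helper target for the matmul benchmark now points at llama-benchmark-matmult.
make run-benchmark-matmult
```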


@ -77,7 +77,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
*Notes:* *Notes:*
- **Memory** - **Memory**
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`. - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, the *llama-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU. - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, the *llama-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.
@ -99,14 +99,14 @@ The docker build option is currently limited to *intel GPU* targets.
### Build image ### Build image
```sh ```sh
# Using FP16 # Using FP16
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile . docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
``` ```
*Notes*: *Notes*:
To build in default FP32 *(slower than the FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command. To build in default FP32 *(slower than the FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative. You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
### Run container ### Run container
@ -275,7 +275,7 @@ source /opt/intel/oneapi/setvars.sh
Similar to the native `sycl-ls`, available SYCL devices can be queried as follows: Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
```sh ```sh
./build/bin/ls-sycl-device ./build/bin/llama-ls-sycl-device
``` ```
An example of such a log on a system with 1 *intel CPU* and 1 *intel GPU* can look like the following: An example of such a log on a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
``` ```
@ -313,7 +313,7 @@ Examples:
- Use device 0: - Use device 0:
```sh ```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
``` ```
or run by script: or run by script:
@ -324,7 +324,7 @@ or run by script:
- Use multiple devices: - Use multiple devices:
```sh ```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
``` ```
Otherwise, you can run the script: Otherwise, you can run the script:
@ -427,7 +427,7 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
*Notes:* *Notes:*
- By default, calling `make` will build all target binary files. For a minimal experimental setup, the user can build only the inference executable with `make main`. - By default, calling `make` will build all target binary files. For a minimal experimental setup, the user can build only the inference executable with `make llama-cli`.
### III. Run the inference ### III. Run the inference
@ -488,13 +488,13 @@ Examples:
- Use device 0: - Use device 0:
``` ```
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0 build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
``` ```
- Use multiple devices: - Use multiple devices:
``` ```
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
``` ```
Otherwise, run the following wrapper script: Otherwise, run the following wrapper script:


@ -10,6 +10,9 @@
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
> [!IMPORTANT]
[2024 Jun 12] Binaries have been renamed with a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc. (https://github.com/ggerganov/llama.cpp/pull/7809)
### Recent API changes ### Recent API changes
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807 - [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
@ -53,7 +56,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
<li><a href="#quantization">Quantization</a></li> <li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li> <li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li> <li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
<li><a href="#instruct-mode">Instruct mode</a></li>
<li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li> <li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li> <li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li> <li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
@ -218,7 +220,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
Here is a typical run using LLaMA v2 13B on M2 Ultra: Here is a typical run using LLaMA v2 13B on M2 Ultra:
``` ```
$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e $ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info: I llama.cpp build info:
I UNAME_S: Darwin I UNAME_S: Darwin
I UNAME_P: arm I UNAME_P: arm
@ -556,7 +558,7 @@ Building the program with BLAS support may lead to some performance improvements
```sh ```sh
# Build the image # Build the image
docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile . docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
# Then, use it: # Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
@ -577,7 +579,9 @@ Building the program with BLAS support may lead to some performance improvements
vulkaninfo vulkaninfo
``` ```
Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead. Alternatively your package manager might be able to provide the appropriate libraries.
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
Then, build llama.cpp using the cmake command below: Then, build llama.cpp using the cmake command below:
@ -585,7 +589,7 @@ Building the program with BLAS support may lead to some performance improvements
cmake -B build -DLLAMA_VULKAN=1 cmake -B build -DLLAMA_VULKAN=1
cmake --build build --config Release cmake --build build --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU) # Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 ./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
# You should see in the output, ggml_vulkan detected your GPU. For example: # You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
@ -622,17 +626,17 @@ python3 convert-hf-to-gguf.py models/mymodel/
python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
# quantize the model to 4-bits (using Q4_K_M method) # quantize the model to 4-bits (using Q4_K_M method)
./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M ./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
# update the gguf filetype to current version if older version is now unsupported # update the gguf filetype to current version if older version is now unsupported
./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY ./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
``` ```
### Run the quantized model ### Run the quantized model
```bash ```bash
# start inference on a gguf model # start inference on a gguf model
./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128 ./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
``` ```
When running the larger models, make sure you have enough disk space to store all the intermediate files. When running the larger models, make sure you have enough disk space to store all the intermediate files.
@ -707,7 +711,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread
#### How to run #### How to run
1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip 1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw` 2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output: 3. Output:
``` ```
perplexity : calculating perplexity over 655 chunks perplexity : calculating perplexity over 655 chunks
@ -731,16 +735,16 @@ Here is an example of a few-shot interaction, invoked with the command
./examples/chat-13B.sh ./examples/chat-13B.sh
# custom arguments using a 13B model # custom arguments using a 13B model
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt ./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
``` ```
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program. Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png) ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
### Persistent Interaction ### Persistent Interaction
The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
```bash ```bash
# Start a new chat # Start a new chat
@ -762,41 +766,13 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only: `llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
```bash ```bash
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' ./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
``` ```
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
### Instruct mode
1. First, download and place the `ggml` model into the `./models` folder
2. Run the `main` tool like this:
```
./examples/alpaca.sh
```
Sample run:
```
== Running in interactive mode. ==
- Press Ctrl+C to interject at any time.
- Press Return to return control to LLaMA.
- If you want to submit another line, end your input in '\'.
Below is an instruction that describes a task. Write a response that appropriately completes the request.
> How many letters are there in the English alphabet?
There 26 letters in the English Alphabet
> What is the most common way of transportation in Amsterdam?
The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
> List 5 words that start with "ca".
cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```
### Obtaining and using the Facebook LLaMA 2 model ### Obtaining and using the Facebook LLaMA 2 model
- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data. - Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
@ -869,7 +845,7 @@ $mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/ho
Now, you can start chatting: Now, you can start chatting:
``` ```
$cd /data/data/com.termux/files/home/bin $cd /data/data/com.termux/files/home/bin
$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml $./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
``` ```
Here's a demo of an interactive session running on Pixel 5 phone: Here's a demo of an interactive session running on Pixel 5 phone:
@ -936,8 +912,8 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
```bash ```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile . docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile . docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile . docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
``` ```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
@ -987,7 +963,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
### Docs ### Docs
- [main](./examples/main/README.md) - [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md) - [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md) - [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md) - [BLIS](./docs/BLIS.md)

ci/run.sh

@ -303,47 +303,47 @@ function gg_run_open_llama_7b_v2 {
wiki_test="${path_wiki}/wiki.test.raw" wiki_test="${path_wiki}/wiki.test.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0 ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0 ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1 ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0 ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1 ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl { function check_ppl {
qnt="$1" qnt="$1"
@ -437,45 +437,45 @@ function gg_run_pythia_1_4b {
wiki_test_60="${path_wiki}/wiki.test-60.raw" wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0 ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0 ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1 ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0 ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1 ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl { function check_ppl {
qnt="$1" qnt="$1"
@ -569,47 +569,47 @@ function gg_run_pythia_2_8b {
wiki_test="${path_wiki}/wiki.test.raw" wiki_test="${path_wiki}/wiki.test.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0 ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0 ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1 ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0 ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1 ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl { function check_ppl {
qnt="$1" qnt="$1"
@ -693,10 +693,10 @@ function gg_run_embd_bge_small {
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
./bin/quantize ${model_f16} ${model_q8_0} q8_0 ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
set +e set +e
} }


@ -200,19 +200,13 @@ void gpt_params_handle_model_default(gpt_params & params) {
} }
params.hf_file = params.model; params.hf_file = params.model;
} else if (params.model.empty()) { } else if (params.model.empty()) {
std::string cache_directory = fs_get_cache_directory(); params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
const bool success = fs_create_directory_with_parents(cache_directory);
if (!success) {
throw std::runtime_error("failed to create cache directory: " + cache_directory);
}
params.model = cache_directory + string_split(params.hf_file, '/').back();
} }
} else if (!params.model_url.empty()) { } else if (!params.model_url.empty()) {
if (params.model.empty()) { if (params.model.empty()) {
auto f = string_split(params.model_url, '#').front(); auto f = string_split(params.model_url, '#').front();
f = string_split(f, '?').front(); f = string_split(f, '?').front();
f = string_split(f, '/').back(); params.model = fs_get_cache_file(string_split(f, '/').back());
params.model = "models/" + f;
} }
} else if (params.model.empty()) { } else if (params.model.empty()) {
params.model = DEFAULT_MODEL_PATH; params.model = DEFAULT_MODEL_PATH;
@ -2279,6 +2273,16 @@ std::string fs_get_cache_directory() {
return ensure_trailing_slash(cache_directory); return ensure_trailing_slash(cache_directory);
} }
std::string fs_get_cache_file(const std::string & filename) {
GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
std::string cache_directory = fs_get_cache_directory();
const bool success = fs_create_directory_with_parents(cache_directory);
if (!success) {
throw std::runtime_error("failed to create cache directory: " + cache_directory);
}
return cache_directory + filename;
}
// //
// Model utils // Model utils
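The new `fs_get_cache_file()` helper above replaces the cache-directory handling that was previously inlined in `gpt_params_handle_model_default()`, so models fetched via `--hf-file` or `--model-url` now resolve to the same per-user cache location. Below is a minimal Python sketch of the equivalent logic, for illustration only: the real implementation is the C++ shown above, and the `LLAMA_CACHE` override and `~/.cache/llama.cpp` default used here are assumptions, not taken from this diff.

```python
import os
from pathlib import Path

def fs_get_cache_file(filename: str) -> str:
    # The C++ helper asserts the name contains no directory separator ...
    assert os.sep not in filename
    # ... then creates the cache directory on demand and returns dir + filename.
    # Cache root below is an assumption for illustration.
    cache_dir = Path(os.environ.get("LLAMA_CACHE", str(Path.home() / ".cache" / "llama.cpp")))
    cache_dir.mkdir(parents=True, exist_ok=True)  # fails loudly, like the C++ runtime_error, if it cannot be created
    return str(cache_dir / filename)

# e.g. a model referenced as --hf-file ggml-model-q4_0.gguf would resolve to
# something like /home/user/.cache/llama.cpp/ggml-model-q4_0.gguf
print(fs_get_cache_file("ggml-model-q4_0.gguf"))
```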


@ -277,6 +277,7 @@ bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path); bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory(); std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);
// //
// Model utils // Model utils


@ -263,7 +263,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
throw std::runtime_error("At least one of min_value or max_value must be set"); throw std::runtime_error("At least one of min_value or max_value must be set");
} }
const std::string SPACE_RULE = "\" \"?"; const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
struct BuiltinRule { struct BuiltinRule {
std::string content; std::string content;
@ -280,7 +280,7 @@ std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}}, {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}}, {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}}, {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}}, {"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}}, {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}}, {"null", {"\"null\" space", {}}},
}; };
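Two behavioural notes on the grammar changes above: the `space` rule now accepts either a single space or a newline followed by at most 20 spaces/tabs, so grammar-constrained JSON can be indented without permitting unbounded whitespace, and the `char` rule no longer admits raw control characters (0x00-0x1F, 0x7F) inside strings, which must instead appear escaped. The Python sketch below uses a regex only as a stand-in to show which separators the new `space` rule accepts as I read it; the actual constraint is the GBNF above, not this regex.

```python
import re

# Rough regex equivalent (for illustration) of the new GBNF rule:
#   space ::= | " " | "\n" [ \t]{0,20}
SPACE = re.compile(r'(?: |\n[ \t]{0,20})?')

tests = ["", " ", "\n", "\n    ", "  ", "\n" + " " * 21]
for t in tests:
    print(repr(t), "accepted" if SPACE.fullmatch(t) else "rejected")
# "", " ", "\n", "\n    "          -> accepted
# "  " and newline + 21 spaces     -> rejected (keeps whitespace bounded)
```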


@ -47,11 +47,12 @@ class Model:
_model_classes: dict[str, type[Model]] = {} _model_classes: dict[str, type[Model]] = {}
dir_model: Path dir_model: Path
ftype: int ftype: gguf.LlamaFileType
is_big_endian: bool is_big_endian: bool
endianess: gguf.GGUFEndian endianess: gguf.GGUFEndian
use_temp_file: bool use_temp_file: bool
lazy: bool lazy: bool
model_name: str | None
part_names: list[str] part_names: list[str]
is_safetensors: bool is_safetensors: bool
hparams: dict[str, Any] hparams: dict[str, Any]
@ -64,7 +65,7 @@ class Model:
# subclasses should define this! # subclasses should define this!
model_arch: gguf.MODEL_ARCH model_arch: gguf.MODEL_ARCH
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool): def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
if type(self) is Model: if type(self) is Model:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
self.dir_model = dir_model self.dir_model = dir_model
@ -73,10 +74,11 @@ class Model:
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file self.use_temp_file = use_temp_file
self.lazy = not eager self.lazy = not eager
self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors") self.model_name = model_name
self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
self.is_safetensors = len(self.part_names) > 0 self.is_safetensors = len(self.part_names) > 0
if not self.is_safetensors: if not self.is_safetensors:
self.part_names = Model.get_model_part_names(self.dir_model, ".bin") self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.hparams = Model.load_hparams(self.dir_model) self.hparams = Model.load_hparams(self.dir_model)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@ -94,7 +96,7 @@ class Model:
ftype_lw: str = ftype_up.lower() ftype_lw: str = ftype_up.lower()
# allow templating the file name with the output ftype, useful with the "auto" ftype # allow templating the file name with the output ftype, useful with the "auto" ftype
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file) self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
@classmethod @classmethod
def __init_subclass__(cls): def __init_subclass__(cls):
@ -182,7 +184,7 @@ class Model:
return new_name return new_name
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_block_count(self.block_count)
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@ -324,21 +326,21 @@ class Model:
def write(self): def write(self):
self.write_tensors() self.write_tensors()
self.gguf_writer.write_header_to_file() self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file() self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.write_tensors_to_file(progress=True)
self.gguf_writer.close() self.gguf_writer.close()
def write_vocab(self): def write_vocab(self):
self.gguf_writer.write_header_to_file() self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file() self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.close() self.gguf_writer.close()
@staticmethod @staticmethod
def get_model_part_names(dir_model: Path, suffix: str) -> list[str]: def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
part_names: list[str] = [] part_names: list[str] = []
for filename in os.listdir(dir_model): for filename in os.listdir(dir_model):
if filename.endswith(suffix): if filename.startswith(prefix) and filename.endswith(suffix):
part_names.append(filename) part_names.append(filename)
part_names.sort() part_names.sort()
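`get_model_part_names()` now filters on a filename prefix as well as a suffix, so shard discovery only picks up actual checkpoint files (`model*.safetensors`, falling back to `pytorch_model*.bin`) and presumably ignores unrelated files that merely share the extension, such as a stray `training_args.bin`. A standalone sketch of the same filter follows; the directory name in the usage lines is hypothetical.

```python
from pathlib import Path

def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
    # keep only files that look like checkpoint shards for this prefix/suffix pair
    part_names = [f.name for f in dir_model.iterdir()
                  if f.name.startswith(prefix) and f.name.endswith(suffix)]
    part_names.sort()
    return part_names

# mirrors the converter's probing order (hypothetical model directory):
parts = get_model_part_names(Path("models/mymodel"), "model", ".safetensors")
if not parts:  # no safetensors shards, try the PyTorch .bin naming
    parts = get_model_part_names(Path("models/mymodel"), "pytorch_model", ".bin")
```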
@ -665,7 +667,7 @@ class GPTNeoXModel(Model):
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"] block_count = self.hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_block_count(block_count)
@ -798,7 +800,7 @@ class MPTModel(Model):
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams["n_layers"] block_count = self.hparams["n_layers"]
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
self.gguf_writer.add_embedding_length(self.hparams["d_model"]) self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_block_count(block_count)
@ -850,7 +852,7 @@ class OrionModel(Model):
raise ValueError("gguf: can not find ctx length parameter.") raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_context_length(ctx_length)
@ -887,7 +889,7 @@ class BaichuanModel(Model):
else: else:
raise ValueError("gguf: can not find ctx length parameter.") raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_context_length(ctx_length)
@ -1010,7 +1012,7 @@ class XverseModel(Model):
else: else:
raise ValueError("gguf: can not find ctx length parameter.") raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_context_length(ctx_length)
@ -1206,7 +1208,7 @@ class StableLMModel(Model):
hparams = self.hparams hparams = self.hparams
block_count = hparams["num_hidden_layers"] block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_block_count(block_count)
@ -1681,7 +1683,7 @@ class GPT2Model(Model):
model_arch = gguf.MODEL_ARCH.GPT2 model_arch = gguf.MODEL_ARCH.GPT2
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_block_count(self.hparams["n_layer"])
self.gguf_writer.add_context_length(self.hparams["n_ctx"]) self.gguf_writer.add_context_length(self.hparams["n_ctx"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@ -2248,7 +2250,7 @@ class GemmaModel(Model):
hparams = self.hparams hparams = self.hparams
block_count = hparams["num_hidden_layers"] block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_block_count(block_count)
@ -2348,7 +2350,7 @@ class MambaModel(Model):
# Fail early for models which don't have a block expansion factor of 2 # Fail early for models which don't have a block expansion factor of 2
assert d_inner == 2 * d_model assert d_inner == 2 * d_model
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@ -2852,7 +2854,7 @@ def main() -> None:
logger.error(f"Model {hparams['architectures'][0]} is not supported") logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1) sys.exit(1)
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy) model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
logger.info("Set model parameters") logger.info("Set model parameters")
model_instance.set_gguf_parameters() model_instance.set_gguf_parameters()
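The repeated `add_name(...)` changes in the model classes above all plumb through the optional model-name argument passed into `main()` here. A small sketch of the override rule (the exact CLI flag spelling is an assumption, inferred from `args.model_name`):

```python
# Name override used by every set_gguf_parameters() above: an explicitly
# provided model name wins, otherwise the model directory's name is used.
from pathlib import Path
from typing import Optional

def resolve_model_name(dir_model: Path, model_name: Optional[str]) -> str:
    return dir_model.name if model_name is None else model_name

print(resolve_model_name(Path("models/Mistral-7B"), None))       # "Mistral-7B"
print(resolve_model_name(Path("models/Mistral-7B"), "my-model")) # "my-model"
```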

@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR. When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback). Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
## GGUF specification ## GGUF specification

@ -3,7 +3,7 @@
## Verifying that the model is running on the GPU with CUDA ## Verifying that the model is running on the GPU with CUDA
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell ```shell
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " ./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
``` ```
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines: When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@ -27,7 +27,7 @@ RAM: 32GB
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML) Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]` Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Result: Result:

@ -13,42 +13,43 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN) if (EMSCRIPTEN)
else() else()
add_subdirectory(baby-llama) add_subdirectory(baby-llama)
add_subdirectory(batched)
add_subdirectory(batched-bench) add_subdirectory(batched-bench)
add_subdirectory(batched)
add_subdirectory(benchmark) add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding) add_subdirectory(embedding)
add_subdirectory(eval-callback) add_subdirectory(eval-callback)
add_subdirectory(export-lora)
add_subdirectory(finetune) add_subdirectory(finetune)
add_subdirectory(gritlm) add_subdirectory(gbnf-validator)
add_subdirectory(gguf-split) add_subdirectory(gguf-split)
add_subdirectory(gguf)
add_subdirectory(gritlm)
add_subdirectory(imatrix)
add_subdirectory(infill) add_subdirectory(infill)
add_subdirectory(llama-bench) add_subdirectory(llama-bench)
add_subdirectory(llava) add_subdirectory(llava)
if (LLAMA_SYCL)
add_subdirectory(sycl)
endif()
add_subdirectory(main)
add_subdirectory(tokenize)
add_subdirectory(parallel)
add_subdirectory(perplexity)
add_subdirectory(quantize)
add_subdirectory(quantize-stats)
add_subdirectory(retrieval)
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(passkey)
add_subdirectory(speculative)
add_subdirectory(lookahead) add_subdirectory(lookahead)
add_subdirectory(lookup) add_subdirectory(lookup)
add_subdirectory(gguf) add_subdirectory(main)
add_subdirectory(train-text-from-scratch) add_subdirectory(parallel)
add_subdirectory(imatrix) add_subdirectory(passkey)
if (LLAMA_BUILD_SERVER) add_subdirectory(perplexity)
add_subdirectory(server) add_subdirectory(quantize-stats)
endif() add_subdirectory(quantize)
add_subdirectory(export-lora) add_subdirectory(retrieval)
if (LLAMA_RPC) if (LLAMA_RPC)
add_subdirectory(rpc) add_subdirectory(rpc)
endif() endif()
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()
if (LLAMA_SYCL)
add_subdirectory(sycl)
endif()
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(tokenize)
add_subdirectory(train-text-from-scratch)
endif() endif()

@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD") GEN_OPTIONS+=(--threads "$N_THREAD")
fi fi
./main "${GEN_OPTIONS[@]}" \ ./llama-cli "${GEN_OPTIONS[@]}" \
--model "$MODEL" \ --model "$MODEL" \
--in-prefix " " \ --in-prefix " " \
--in-suffix "${AI_NAME}:" \ --in-suffix "${AI_NAME}:" \

@ -1,19 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
--temp 0.2 \
--repeat_penalty 1.1 \
-t 7

@ -1,4 +1,4 @@
set(TARGET baby-llama) set(TARGET llama-baby-llama)
add_executable(${TARGET} baby-llama.cpp) add_executable(${TARGET} baby-llama.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@ -58,4 +58,4 @@ echo "$2
model=$1 model=$1
# generate the most likely continuation until the string "===" is found # generate the most likely continuation until the string "===" is found
./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs ./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs

@ -1,4 +1,4 @@
set(TARGET batched-bench) set(TARGET llama-batched-bench)
add_executable(${TARGET} batched-bench.cpp) add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`) - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash ```bash
./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] ./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 ./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps ./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
# custom set of batches # custom set of batches
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 ./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
``` ```
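For the shared-prompt mode described above, the `N_KV = PP + B*TG` relation sets the minimum KV-cache size. A quick worked example using values taken from the flags shown (illustrative only):

```python
# Shared prompt: PP prompt tokens are stored once, then each of the B parallel
# sequences generates TG tokens, so the KV cache must hold PP + B*TG tokens.
def n_kv_shared(pp: int, tg: int, n_parallel: int) -> int:
    return pp + n_parallel * tg

# e.g. -npp 128 -ntg 128 -npl 4 with -pps:
print(n_kv_shared(128, 128, 4))  # 640 -> -c / N_KV_MAX must be at least this
```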
## Sample results ## Sample results

@ -1,6 +1,6 @@
.PHONY: build .PHONY: build
build: build:
xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./batched_swift rm -f ./llama-batched-swift
ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift

@ -4,7 +4,7 @@
import PackageDescription import PackageDescription
let package = Package( let package = Package(
name: "batched_swift", name: "llama-batched-swift",
platforms: [.macOS(.v12)], platforms: [.macOS(.v12)],
dependencies: [ dependencies: [
.package(name: "llama", path: "../../"), .package(name: "llama", path: "../../"),
@ -13,7 +13,7 @@ let package = Package(
// Targets are the basic building blocks of a package, defining a module or a test suite. // Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies. // Targets can depend on other targets in this package and products from dependencies.
.executableTarget( .executableTarget(
name: "batched_swift", name: "llama-batched-swift",
dependencies: ["llama"], dependencies: ["llama"],
path: "Sources", path: "Sources",
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")] linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]

@ -1,4 +1,4 @@
This is a swift clone of `examples/batched`. This is a swift clone of `examples/batched`.
$ `make` $ `make`
$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]` $ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`

@ -1,4 +1,4 @@
set(TARGET batched) set(TARGET llama-batched)
add_executable(${TARGET} batched.cpp) add_executable(${TARGET} batched.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@ -3,7 +3,7 @@
The example demonstrates batched generation from a given prompt The example demonstrates batched generation from a given prompt
```bash ```bash
./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 ./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
... ...

@ -1,4 +1,4 @@
set(TARGET benchmark) set(TARGET llama-bench-matmult)
add_executable(${TARGET} benchmark-matmult.cpp) add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})

@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE $PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./main $GEN_OPTIONS \ ./llama-cli $GEN_OPTIONS \
--model "$MODEL" \ --model "$MODEL" \
--threads "$N_THREAD" \ --threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \ --n_predict "$N_PREDICTS" \

@ -62,7 +62,7 @@ fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...' echo 'Prompt cache does not exist, building...'
# Default batch_size to 64 here for better user feedback during initial prompt processing # Default batch_size to 64 here for better user feedback during initial prompt processing
./main 2>>"$LOG" \ ./llama-cli 2>>"$LOG" \
--batch_size 64 \ --batch_size 64 \
"${OPTS[@]}" \ "${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \ --prompt-cache "$PROMPT_CACHE_FILE" \
@ -109,13 +109,13 @@ while read -e line; do
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE" printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
./main 2>>"$LOG" "${OPTS[@]}" \ ./llama-cli 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \ --prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \ --prompt-cache-all \
--file "$CUR_PROMPT_FILE" \ --file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \ --reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" | --n_predict "$n_predict" |
skip_bytes 1 | # skip BOS token added by ./main skip_bytes 1 | # skip BOS token added by ./llama-cli
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation skip_bytes "$n_prompt_len_pre" # print generation
@ -133,7 +133,7 @@ while read -e line; do
# TODO get both messages in one go # TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" || if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
echo >&2 "Couldn't get number of tokens from ./main output!" echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
exit 1 exit 1
fi fi
@ -144,7 +144,7 @@ while read -e line; do
fi fi
# Update cache for next prompt in background, ideally during user input # Update cache for next prompt in background, ideally during user input
./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \ ./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \ --prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \ --file "$NEXT_PROMPT_FILE" \
--n_predict 1 & --n_predict 1 &

@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE $PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./bin/main $GEN_OPTIONS \ ./bin/llama-cli $GEN_OPTIONS \
--model "$MODEL" \ --model "$MODEL" \
--threads "$N_THREAD" \ --threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \ --n_predict "$N_PREDICTS" \

@ -11,6 +11,6 @@ cd ..
# #
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt # "--keep 48" is based on the contents of prompts/chat-with-bob.txt
# #
./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \ ./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \ --repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt -r "User:" -f prompts/chat-with-bob.txt

@ -1,4 +1,4 @@
set(TARGET convert-llama2c-to-ggml) set(TARGET llama-convert-llama2c-to-ggml)
add_executable(${TARGET} convert-llama2c-to-ggml.cpp) add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@ -8,7 +8,7 @@ To convert the model first download the models from the [llama2.c](https://githu
After successful compilation, following usage options are available: After successful compilation, following usage options are available:
``` ```
usage: ./convert-llama2c-to-ggml [options] usage: ./llama-convert-llama2c-to-ggml [options]
options: options:
-h, --help show this help message and exit -h, --help show this help message and exit
@ -19,10 +19,10 @@ options:
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows: An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` `$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K). Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
Now you can use the model with a command like: Now you can use the model with a command like:
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` `$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

@ -1,4 +1,4 @@
set(TARGET embedding) set(TARGET llama-embedding)
add_executable(${TARGET} embedding.cpp) add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.): ### Unix-based systems (Linux, macOS, etc.):
```bash ```bash
./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null ./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
``` ```
### Windows: ### Windows:
```powershell ```powershell
embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
``` ```
The above command will output space-separated float values. The above command will output space-separated float values.

@ -1,9 +1,9 @@
set(TARGET eval-callback) set(TARGET llama-eval-callback)
add_executable(${TARGET} eval-callback.cpp) add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TEST_TARGET test-eval-callback) set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data.
Usage: Usage:
```shell ```shell
eval-callback \ llama-eval-callback \
--hf-repo ggml-org/models \ --hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \ --hf-file phi-2/ggml-model-q4_0.gguf \
--model phi-2-q4_0.gguf \ --model phi-2-q4_0.gguf \

@ -1,4 +1,4 @@
set(TARGET export-lora) set(TARGET llama-export-lora)
add_executable(${TARGET} export-lora.cpp) add_executable(${TARGET} export-lora.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@ -3,7 +3,7 @@
Apply LORA adapters to base model and export the resulting model. Apply LORA adapters to base model and export the resulting model.
``` ```
usage: export-lora [options] usage: llama-export-lora [options]
options: options:
-h, --help show this help message and exit -h, --help show this help message and exit
@ -17,7 +17,7 @@ options:
For example: For example:
```bash ```bash
./bin/export-lora \ ./bin/llama-export-lora \
-m open-llama-3b-v2-q8_0.gguf \ -m open-llama-3b-v2-q8_0.gguf \
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin

@ -1,4 +1,4 @@
set(TARGET finetune) set(TARGET llama-finetune)
add_executable(${TARGET} finetune.cpp) add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@ -7,7 +7,7 @@ Basic usage instructions:
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
# finetune LORA adapter # finetune LORA adapter
./bin/finetune \ ./bin/llama-finetune \
--model-base open-llama-3b-v2-q8_0.gguf \ --model-base open-llama-3b-v2-q8_0.gguf \
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \ --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \ --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
--use-checkpointing --use-checkpointing
# predict # predict
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin ./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
``` ```
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`). **Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
@ -38,14 +38,14 @@ After 10 more iterations:
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter. Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`. llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above. These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
In `main` you can also load multiple LORA adapters, which will then be mixed together. In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this: For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
```bash ```bash
./bin/main -m open-llama-3b-v2-q8_0.gguf \ ./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \ --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
``` ```
@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one: For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
```bash ```bash
./bin/main -m open-llama-3b-v2-q8_0.gguf \ ./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \ --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \ --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin

@ -2,7 +2,7 @@
cd `dirname $0` cd `dirname $0`
cd ../.. cd ../..
EXE="./finetune" EXE="./llama-finetune"
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi

@ -1,5 +1,5 @@
set(TARGET gbnf-validator) set(TARGET llama-gbnf-validator)
add_executable(${TARGET} gbnf-validator.cpp) add_executable(${TARGET} gbnf-validator.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)

@ -7,6 +7,8 @@
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <sstream>
#include <fstream>
#include <string> #include <string>
#include <vector> #include <vector>
@ -69,13 +71,14 @@ int main(int argc, char** argv) {
return 1; return 1;
} }
fseek(grammar_file, 0, SEEK_END); std::string grammar_str;
size_t grammar_size = ftell(grammar_file); {
fseek(grammar_file, 0, SEEK_SET); std::ifstream grammar_file(grammar_filename);
GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
std::string grammar_str(grammar_size, ' '); std::stringstream buffer;
fread(&grammar_str[0], 1, grammar_size, grammar_file); buffer << grammar_file.rdbuf();
fclose(grammar_file); grammar_str = buffer.str();
}
// Parse the GBNF grammar // Parse the GBNF grammar
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str()); auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
@ -100,20 +103,15 @@ int main(int argc, char** argv) {
grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
// Read the input file // Read the input file
FILE* input_file = fopen(input_filename.c_str(), "r"); std::string input_str;
if (!input_file) { {
fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str()); std::ifstream input_file(input_filename);
return 1; GGML_ASSERT(input_file.is_open() && "Failed to open input file");
std::stringstream buffer;
buffer << input_file.rdbuf();
input_str = buffer.str();
} }
fseek(input_file, 0, SEEK_END);
size_t input_size = ftell(input_file);
fseek(input_file, 0, SEEK_SET);
std::string input_str(input_size, ' ');
fread(&input_str[0], 1, input_size, input_file);
fclose(input_file);
// Validate the input string against the grammar // Validate the input string against the grammar
size_t error_pos; size_t error_pos;
std::string error_msg; std::string error_msg;

@ -1,4 +1,4 @@
set(TARGET gguf-split) set(TARGET llama-gguf-split)
add_executable(${TARGET} gguf-split.cpp) add_executable(${TARGET} gguf-split.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@ -18,8 +18,8 @@ fi
set -x set -x
SPLIT=$1/gguf-split SPLIT=$1/llama-gguf-split
MAIN=$1/main MAIN=$1/llama-cli
WORK_PATH=$TMP_DIR/gguf-split WORK_PATH=$TMP_DIR/gguf-split
ROOT_DIR=$(realpath $(dirname $0)/../../) ROOT_DIR=$(realpath $(dirname $0)/../../)

@ -1,4 +1,4 @@
set(TARGET gguf) set(TARGET llama-gguf)
add_executable(${TARGET} gguf.cpp) add_executable(${TARGET} gguf.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})

@ -1,15 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main --color --instruct --threads 4 \
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
--file ./prompts/alpaca.txt \
--batch_size 8 --ctx_size 2048 -n -1 \
--repeat_last_n 64 --repeat_penalty 1.3 \
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95

@ -1,4 +1,4 @@
set(TARGET gritlm) set(TARGET llama-gritlm)
add_executable(${TARGET} gritlm.cpp) add_executable(${TARGET} gritlm.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@ -26,7 +26,7 @@ $ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --ou
Run the example using the downloaded model: Run the example using the downloaded model:
```console ```console
$ ./gritlm -m models/gritlm-7b_q4_1.gguf $ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103

@ -1,4 +1,4 @@
set(TARGET imatrix) set(TARGET llama-imatrix)
add_executable(${TARGET} imatrix.cpp) add_executable(${TARGET} imatrix.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@ -6,7 +6,7 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
## Usage ## Usage
``` ```
./imatrix \ ./llama-imatrix \
-m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \ -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
[--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \ [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
[--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
@ -28,8 +28,8 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
LLAMA_CUDA=1 make -j LLAMA_CUDA=1 make -j
# generate importance matrix (imatrix.dat) # generate importance matrix (imatrix.dat)
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 ./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
# use the imatrix to perform a Q4_K_M quantization # use the imatrix to perform a Q4_K_M quantization
./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m ./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
``` ```

@ -218,20 +218,64 @@ void IMatrixCollector::save_imatrix(int ncall) const {
fname += std::to_string(ncall); fname += std::to_string(ncall);
} }
// avoid writing imatrix entries that do not have full data
// this can happen with MoE models where some of the experts end up not being exercised by the provided training data
int n_entries = 0;
std::vector<std::string> to_store;
bool is_first = true; // for printing
for (const auto & kv : m_stats) {
const int n_all = kv.second.counts.size();
if (n_all == 0) {
continue;
}
int n_zeros = 0;
for (const int c : kv.second.counts) {
if (c == 0) {
n_zeros++;
}
}
if (n_zeros != 0 && is_first) {
fprintf(stderr, "\n");
is_first = false;
}
if (n_zeros == n_all) {
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
continue;
}
if (n_zeros > 0) {
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
continue;
}
n_entries++;
to_store.push_back(kv.first);
}
if (to_store.size() < m_stats.size()) {
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
}
std::ofstream out(fname, std::ios::binary); std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size();
out.write((const char *) &n_entries, sizeof(n_entries)); out.write((const char *) &n_entries, sizeof(n_entries));
for (const auto & p : m_stats) { for (const auto & name : to_store) {
int len = p.first.size(); const auto & stat = m_stats.at(name);
int len = name.size();
out.write((const char *) &len, sizeof(len)); out.write((const char *) &len, sizeof(len));
out.write(p.first.c_str(), len); out.write(name.c_str(), len);
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall)); out.write((const char *) &stat.ncall, sizeof(stat.ncall));
int nval = p.second.values.size(); int nval = stat.values.size();
out.write((const char *) &nval, sizeof(nval)); out.write((const char *) &nval, sizeof(nval));
if (nval > 0) { if (nval > 0) {
std::vector<float> tmp(nval); std::vector<float> tmp(nval);
for (int i = 0; i < nval; i++) { for (int i = 0; i < nval; i++) {
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall); tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
} }
out.write((const char*)tmp.data(), nval*sizeof(float)); out.write((const char*)tmp.data(), nval*sizeof(float));
} }
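The block added above filters which entries make it into the written imatrix. A rough Python sketch of the same decision, for illustration only (the real implementation is the C++ in this hunk):

```python
# Entries with no activation counts at all (e.g. MoE experts never exercised by
# the calibration data) are skipped, entries with only partial counts are
# skipped with a warning, and only fully-populated entries are written out.
import sys

def select_entries(stats: dict[str, list[int]]) -> list[str]:
    to_store: list[str] = []
    for name, counts in stats.items():
        if not counts:
            continue
        n_zeros = sum(1 for c in counts if c == 0)
        if n_zeros == len(counts):
            print(f"entry '{name}' has no data - skipping", file=sys.stderr)
            continue
        if n_zeros > 0:
            pct = 100.0 * (len(counts) - n_zeros) / len(counts)
            print(f"entry '{name}' has partial data ({pct:.2f}%) - skipping", file=sys.stderr)
            continue
        to_store.append(name)
    return to_store
```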

@ -1,4 +1,4 @@
set(TARGET infill) set(TARGET llama-infill)
add_executable(${TARGET} infill.cpp) add_executable(${TARGET} infill.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@ -42,5 +42,5 @@ scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.ggu
``` ```
```bash ```bash
./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n " ./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
``` ```

@ -21,7 +21,7 @@ counter=1
echo 'Running' echo 'Running'
while IFS= read -r question while IFS= read -r question
do do
exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\"" exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
echo $counter echo $counter
echo "Current Question: $question" echo "Current Question: $question"
eval "$exe_cmd" eval "$exe_cmd"

@ -1,5 +1,5 @@
# Usage: # Usage:
#! ./server -m some-model.gguf & #! ./llama-server -m some-model.gguf &
#! pip install pydantic #! pip install pydantic
#! python json-schema-pydantic-example.py #! python json-schema-pydantic-example.py

@ -193,9 +193,8 @@ class BuiltinRule:
self.content = content self.content = content
self.deps = deps or [] self.deps = deps or []
# whitespace is constrained to a single space char to prevent model "running away" in # Constraining spaces to prevent model "running away".
# whitespace. Also maybe improves generation quality? SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
SPACE_RULE = '" "?'
PRIMITIVE_RULES = { PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []), 'boolean' : BuiltinRule('("true" | "false") space', []),
@ -207,7 +206,7 @@ PRIMITIVE_RULES = {
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []), 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []), 'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []), 'null' : BuiltinRule('"null" space', []),
} }
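Two behavioural changes sit in the hunks above: `SPACE_RULE` now also admits a newline followed by up to 20 spaces or tabs (instead of at most one space), and the `char` rule rejects raw control characters and DEL while dropping the escaped `/` alternative. A rough regex translation of the new `char` rule, for illustration only:

```python
# Approximate regex equivalent of the tightened GBNF `char` rule: one ordinary
# character (no quote, backslash, DEL or control chars), or a backslash escape.
import re

CHAR_RE = re.compile(r'[^"\\\x7F\x00-\x1F]|\\(["\\bfnrt]|u[0-9a-fA-F]{4})')

def is_valid_json_char(tok: str) -> bool:
    return CHAR_RE.fullmatch(tok) is not None

print(is_valid_json_char("a"))        # True  - ordinary character
print(is_valid_json_char("\\n"))      # True  - escaped newline
print(is_valid_json_char("\x07"))     # False - raw control character rejected
print(is_valid_json_char("\\u0041"))  # True  - unicode escape
```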
@ -706,7 +705,7 @@ class SchemaConverter:
def main(args_in = None): def main(args_in = None):
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description=''' description='''
Generates a grammar (suitable for use in ./main) that produces JSON conforming to a Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
given JSON schema. Only a subset of JSON schema features are supported; more may be given JSON schema. Only a subset of JSON schema features are supported; more may be
added in the future. added in the future.
''', ''',

@ -1,4 +1,4 @@
# llama.cpp/example/llama-bench # llama.cpp/examples/llama-bench
Performance testing tool for llama.cpp. Performance testing tool for llama.cpp.

@ -1033,6 +1033,27 @@ struct markdown_printer : public printer {
if (field == "n_gpu_layers") { if (field == "n_gpu_layers") {
return 3; return 3;
} }
if (field == "n_threads") {
return 7;
}
if (field == "n_batch") {
return 7;
}
if (field == "n_ubatch") {
return 8;
}
if (field == "type_k" || field == "type_v") {
return 6;
}
if (field == "split_mode") {
return 5;
}
if (field == "flash_attn") {
return 2;
}
if (field == "use_mmap") {
return 4;
}
if (field == "test") { if (field == "test") {
return 13; return 13;
} }
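The width entries added above pin per-field column widths for the markdown output. A toy illustration of how such fixed widths keep table cells aligned (field names copied from the hunk, the rest is illustrative):

```python
# Each known field gets a fixed minimum width; values are right-aligned to it
# when a markdown table row is printed.
FIELD_WIDTH = {
    "n_gpu_layers": 3, "n_threads": 7, "n_batch": 7, "n_ubatch": 8,
    "type_k": 6, "type_v": 6, "split_mode": 5, "flash_attn": 2,
    "use_mmap": 4, "test": 13,
}

def cell(field: str, value) -> str:
    width = FIELD_WIDTH.get(field, len(str(value)))
    return f"{value:>{width}}"

print("|", cell("n_threads", 8), "|", cell("n_batch", 2048), "|", cell("test", "pp512"), "|")
# |       8 |    2048 |         pp512 |
```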

@ -1,18 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
--temp 0.2 \
--repeat_penalty 1.1 \
-t 8

@ -1,18 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
--temp 0.2 \
--repeat_penalty 1.1 \
-t 8

@ -30,8 +30,9 @@ if(TARGET BUILD_INFO)
add_dependencies(llava BUILD_INFO) add_dependencies(llava BUILD_INFO)
endif() endif()
set(TARGET llava-cli) set(TARGET llama-llava-cli)
add_executable(llava-cli llava-cli.cpp) add_executable(${TARGET} llava-cli.cpp)
install(TARGETS llava-cli RUNTIME) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) install(TARGETS ${TARGET} RUNTIME)
target_compile_features(llava PRIVATE cxx_std_11) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

@ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown. Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
## Usage ## Usage
Build with cmake or run `make llava-cli` to build it. Build with cmake or run `make llama-llava-cli` to build it.
After building, run: `./llava-cli` to see the usage. For example: After building, run: `./llama-llava-cli` to see the usage. For example:
```sh ```sh
./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \ ./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \ --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
--image path/to/an/image.jpg \ --image path/to/an/image.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:" -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
@ -62,7 +62,7 @@ python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
```sh ```sh
./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s ./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
``` ```
Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory. Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
### case 1 ### case 1
**input** **input**
```sh ```sh
/data/local/tmp/llava-cli \ /data/local/tmp/llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \ -m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \ -t 4 \
@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms
### case 2 ### case 2
**input** **input**
```sh ```sh
/data/local/tmp/llava-cli \ /data/local/tmp/llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \ -m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \ -t 4 \
@ -126,7 +126,7 @@ llama_print_timings: total time = 34570.79 ms
#### llava-cli release-b2005 #### llava-cli release-b2005
**input** **input**
```sh ```sh
/data/local/tmp/llava-cli \ /data/local/tmp/llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \ -m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \ -t 4 \
@ -200,7 +200,7 @@ make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
### case 1 ### case 1
**input** **input**
```sh ```sh
./llava-cli \ ./llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \ -m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \
--image /data/local/tmp/demo.jpeg \ --image /data/local/tmp/demo.jpeg \
@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens
### case 2 ### case 2
**input** **input**
```sh ```sh
./llava-cli \ ./llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \ -m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \ -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \

@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
After API is confirmed, more models will be supported / uploaded. After API is confirmed, more models will be supported / uploaded.
## Usage ## Usage
Build with cmake or run `make llava-cli` to build it. Build with cmake or run `make llama-llava-cli` to build it.
After building, run: `./llava-cli` to see the usage. For example: After building, run: `./llama-llava-cli` to see the usage. For example:
```sh ```sh
./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg ./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
``` ```
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@ -95,9 +95,9 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto
python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
``` ```
7) And finally we can run the llava-cli using the 1.6 model version: 7) And finally we can run the llava cli using the 1.6 model version:
```console ```console
./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 ./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
``` ```
**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)

@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant.
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" # prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
program_dir="build_64/bin" program_dir="build_64/bin"
binName="llava-cli" binName="llama-llava-cli"
n_threads=4 n_threads=4

@ -1,4 +1,4 @@
set(TARGET lookahead) set(TARGET llama-lookahead)
add_executable(${TARGET} lookahead.cpp) add_executable(${TARGET} lookahead.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@ -1,22 +1,22 @@
set(TARGET lookup) set(TARGET llama-lookup)
add_executable(${TARGET} lookup.cpp) add_executable(${TARGET} lookup.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TARGET lookup-create) set(TARGET llama-lookup-create)
add_executable(${TARGET} lookup-create.cpp) add_executable(${TARGET} lookup-create.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TARGET lookup-merge) set(TARGET llama-lookup-merge)
add_executable(${TARGET} lookup-merge.cpp) add_executable(${TARGET} lookup-merge.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TARGET lookup-stats) set(TARGET llama-lookup-stats)
add_executable(${TARGET} lookup-stats.cpp) add_executable(${TARGET} lookup-stats.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@ -11,14 +11,14 @@
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
static void print_usage() { static void print_usage(char* argv0) {
fprintf(stderr, "Merges multiple lookup cache files into a single one.\n"); fprintf(stderr, "Merges multiple lookup cache files into a single one.\n");
fprintf(stderr, "Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n"); fprintf(stderr, "Usage: %s [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n", argv0);
} }
int main(int argc, char ** argv){ int main(int argc, char ** argv){
if (argc < 3) { if (argc < 3) {
print_usage(); print_usage(argv[0]);
exit(1); exit(1);
} }
@ -27,7 +27,7 @@ int main(int argc, char ** argv){
for (int i = 0; i < argc-1; ++i) { for (int i = 0; i < argc-1; ++i) {
args[i] = argv[i+1]; args[i] = argv[i+1];
if (args[i] == "-h" || args[i] == "--help") { if (args[i] == "-h" || args[i] == "--help") {
print_usage(); print_usage(argv[0]);
exit(0); exit(0);
} }
} }

@ -1,12 +1,12 @@
cmake_minimum_required(VERSION 3.12) cmake_minimum_required(VERSION 3.12)
project("main-cmake-pkg" C CXX) project("llama-cli-cmake-pkg" C CXX)
set(TARGET main-cmake-pkg) set(TARGET llama-cli-cmake-pkg)
find_package(Llama 0.0.1 REQUIRED) find_package(Llama 0.0.1 REQUIRED)
# Bake common functionality in with target. Because applications # Bake common functionality in with target. Because applications
# using the relocatable Llama package should be outside of the # using the relocatable Llama package should be outside of the
# source tree, main-cmake-pkg pretends the dependencies are built-in. # source tree, llama-cli-cmake-pkg pretends the dependencies are built-in.
set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common") set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
add_library(common OBJECT) add_library(common OBJECT)
file(GLOB _common_files file(GLOB _common_files
@ -15,7 +15,7 @@ file(GLOB _common_files
) )
target_sources(common PRIVATE ${_common_files}) target_sources(common PRIVATE ${_common_files})
# If the common project was part of "main-cmake-pkg" the transient # If the common project was part of "llama-cli-cmake-pkg" the transient
# defines would automatically be attached. Because the common func- # defines would automatically be attached. Because the common func-
# tionality is separate, but dependent upon the defines, it must be # tionality is separate, but dependent upon the defines, it must be
# explicitly extracted from the "llama" target. # explicitly extracted from the "llama" target.

@ -1,6 +1,6 @@
# llama.cpp/example/main-cmake-pkg # llama.cpp/example/main-cmake-pkg
This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree. This program builds [llama-cli](../main) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
## Building ## Building
@ -20,7 +20,7 @@ cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP cmake --install build --prefix C:/LlamaCPP
``` ```
### Build main-cmake-pkg ### Build llama-cli-cmake-pkg
```cmd ```cmd

@ -1,4 +1,4 @@
set(TARGET main) set(TARGET llama-cli)
add_executable(${TARGET} main.cpp) add_executable(${TARGET} main.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

Some files were not shown because too many files have changed in this diff.