Merge branch 'master' into convert-split
This commit is contained in:
commit
4e4e376e1e
194 changed files with 10214 additions and 7749 deletions
|
@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
|
||||||
stage('Running llama.cpp'){
|
stage('Running llama.cpp'){
|
||||||
sh'''#!/bin/bash
|
sh'''#!/bin/bash
|
||||||
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
|
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
|
||||||
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
|
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
|
||||||
cat llama_log.txt # Printing results
|
cat llama_log.txt # Printing results
|
||||||
'''
|
'''
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,13 +23,13 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||||
# Enable CUDA
|
# Enable CUDA
|
||||||
ENV LLAMA_CUDA=1
|
ENV LLAMA_CUDA=1
|
||||||
|
|
||||||
RUN make -j$(nproc) main
|
RUN make -j$(nproc) llama-cli
|
||||||
|
|
||||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libgomp1
|
apt-get install -y libgomp1
|
||||||
|
|
||||||
COPY --from=build /app/main /main
|
COPY --from=build /app/llama-cli /llama-cli
|
||||||
|
|
||||||
ENTRYPOINT [ "/main" ]
|
ENTRYPOINT [ "/llama-cli" ]
|
26
.devops/llama-cli-intel.Dockerfile
Normal file
26
.devops/llama-cli-intel.Dockerfile
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||||
|
|
||||||
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||||
|
|
||||||
|
ARG LLAMA_SYCL_F16=OFF
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y git
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
||||||
|
echo "LLAMA_SYCL_F16 is set" && \
|
||||||
|
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
||||||
|
fi && \
|
||||||
|
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||||
|
cmake --build build --config Release --target llama-cli
|
||||||
|
|
||||||
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||||
|
|
||||||
|
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
||||||
|
|
||||||
|
ENV LC_ALL=C.utf8
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/llama-cli" ]
|
|
@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
|
||||||
ENV CC=/opt/rocm/llvm/bin/clang
|
ENV CC=/opt/rocm/llvm/bin/clang
|
||||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||||
|
|
||||||
RUN make -j$(nproc) main
|
RUN make -j$(nproc) llama-cli
|
||||||
|
|
||||||
ENTRYPOINT [ "/app/main" ]
|
ENTRYPOINT [ "/app/llama-cli" ]
|
|
@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
COPY . .
|
COPY . .
|
||||||
RUN cmake -B build -DLLAMA_VULKAN=1 && \
|
RUN cmake -B build -DLLAMA_VULKAN=1 && \
|
||||||
cmake --build build --config Release --target main
|
cmake --build build --config Release --target llama-cli
|
||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
WORKDIR /
|
WORKDIR /
|
||||||
RUN cp /app/build/bin/main /main && \
|
RUN cp /app/build/bin/llama-cli /llama-cli && \
|
||||||
rm -rf /app
|
rm -rf /app
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
ENV LC_ALL=C.utf8
|
||||||
|
|
||||||
ENTRYPOINT [ "/main" ]
|
ENTRYPOINT [ "/llama-cli" ]
|
|
@ -9,15 +9,15 @@ WORKDIR /app
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
RUN make -j$(nproc) main
|
RUN make -j$(nproc) llama-cli
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libgomp1
|
apt-get install -y libgomp1
|
||||||
|
|
||||||
COPY --from=build /app/main /main
|
COPY --from=build /app/llama-cli /llama-cli
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
ENV LC_ALL=C.utf8
|
||||||
|
|
||||||
ENTRYPOINT [ "/main" ]
|
ENTRYPOINT [ "/llama-cli" ]
|
|
@ -36,9 +36,9 @@ make -j LLAMA_CLBLAST=1
|
||||||
|
|
||||||
%install
|
%install
|
||||||
mkdir -p %{buildroot}%{_bindir}/
|
mkdir -p %{buildroot}%{_bindir}/
|
||||||
cp -p main %{buildroot}%{_bindir}/llamaclblast
|
cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli
|
||||||
cp -p server %{buildroot}%{_bindir}/llamaclblastserver
|
cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server
|
||||||
cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
|
cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple
|
||||||
|
|
||||||
mkdir -p %{buildroot}/usr/lib/systemd/system
|
mkdir -p %{buildroot}/usr/lib/systemd/system
|
||||||
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
|
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
|
||||||
|
@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
EnvironmentFile=/etc/sysconfig/llama
|
EnvironmentFile=/etc/sysconfig/llama
|
||||||
ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
|
ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS
|
||||||
ExecReload=/bin/kill -s HUP $MAINPID
|
ExecReload=/bin/kill -s HUP $MAINPID
|
||||||
Restart=never
|
Restart=never
|
||||||
|
|
||||||
|
@ -67,9 +67,9 @@ rm -rf %{buildroot}
|
||||||
rm -rf %{_builddir}/*
|
rm -rf %{_builddir}/*
|
||||||
|
|
||||||
%files
|
%files
|
||||||
%{_bindir}/llamaclblast
|
%{_bindir}/llama-clblast-cli
|
||||||
%{_bindir}/llamaclblastserver
|
%{_bindir}/llama-clblast-server
|
||||||
%{_bindir}/llamaclblastsimple
|
%{_bindir}/llama-clblast-simple
|
||||||
/usr/lib/systemd/system/llamaclblast.service
|
/usr/lib/systemd/system/llamaclblast.service
|
||||||
%config /etc/sysconfig/llama
|
%config /etc/sysconfig/llama
|
||||||
|
|
||||||
|
|
|
@ -36,9 +36,9 @@ make -j LLAMA_CUDA=1
|
||||||
|
|
||||||
%install
|
%install
|
||||||
mkdir -p %{buildroot}%{_bindir}/
|
mkdir -p %{buildroot}%{_bindir}/
|
||||||
cp -p main %{buildroot}%{_bindir}/llamacppcuda
|
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
|
||||||
cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
|
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
|
||||||
cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
|
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
|
||||||
|
|
||||||
mkdir -p %{buildroot}/usr/lib/systemd/system
|
mkdir -p %{buildroot}/usr/lib/systemd/system
|
||||||
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
|
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
|
||||||
|
@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
EnvironmentFile=/etc/sysconfig/llama
|
EnvironmentFile=/etc/sysconfig/llama
|
||||||
ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
|
ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
|
||||||
ExecReload=/bin/kill -s HUP $MAINPID
|
ExecReload=/bin/kill -s HUP $MAINPID
|
||||||
Restart=never
|
Restart=never
|
||||||
|
|
||||||
|
@ -67,9 +67,9 @@ rm -rf %{buildroot}
|
||||||
rm -rf %{_builddir}/*
|
rm -rf %{_builddir}/*
|
||||||
|
|
||||||
%files
|
%files
|
||||||
%{_bindir}/llamacppcuda
|
%{_bindir}/llama-cuda-cli
|
||||||
%{_bindir}/llamacppcudaserver
|
%{_bindir}/llama-cuda-server
|
||||||
%{_bindir}/llamacppcudasimple
|
%{_bindir}/llama-cuda-simple
|
||||||
/usr/lib/systemd/system/llamacuda.service
|
/usr/lib/systemd/system/llamacuda.service
|
||||||
%config /etc/sysconfig/llama
|
%config /etc/sysconfig/llama
|
||||||
|
|
||||||
|
|
|
@ -38,9 +38,9 @@ make -j
|
||||||
|
|
||||||
%install
|
%install
|
||||||
mkdir -p %{buildroot}%{_bindir}/
|
mkdir -p %{buildroot}%{_bindir}/
|
||||||
cp -p main %{buildroot}%{_bindir}/llama
|
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
|
||||||
cp -p server %{buildroot}%{_bindir}/llamaserver
|
cp -p llama-server %{buildroot}%{_bindir}/llama-server
|
||||||
cp -p simple %{buildroot}%{_bindir}/llamasimple
|
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
|
||||||
|
|
||||||
mkdir -p %{buildroot}/usr/lib/systemd/system
|
mkdir -p %{buildroot}/usr/lib/systemd/system
|
||||||
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
|
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
|
||||||
|
@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
EnvironmentFile=/etc/sysconfig/llama
|
EnvironmentFile=/etc/sysconfig/llama
|
||||||
ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
|
ExecStart=/usr/bin/llama-server $LLAMA_ARGS
|
||||||
ExecReload=/bin/kill -s HUP $MAINPID
|
ExecReload=/bin/kill -s HUP $MAINPID
|
||||||
Restart=never
|
Restart=never
|
||||||
|
|
||||||
|
@ -69,9 +69,9 @@ rm -rf %{buildroot}
|
||||||
rm -rf %{_builddir}/*
|
rm -rf %{_builddir}/*
|
||||||
|
|
||||||
%files
|
%files
|
||||||
%{_bindir}/llama
|
%{_bindir}/llama-cli
|
||||||
%{_bindir}/llamaserver
|
%{_bindir}/llama-server
|
||||||
%{_bindir}/llamasimple
|
%{_bindir}/llama-simple
|
||||||
/usr/lib/systemd/system/llama.service
|
/usr/lib/systemd/system/llama.service
|
||||||
%config /etc/sysconfig/llama
|
%config /etc/sysconfig/llama
|
||||||
|
|
||||||
|
|
|
@ -25,13 +25,13 @@ ENV LLAMA_CUDA=1
|
||||||
# Enable cURL
|
# Enable cURL
|
||||||
ENV LLAMA_CURL=1
|
ENV LLAMA_CURL=1
|
||||||
|
|
||||||
RUN make -j$(nproc) server
|
RUN make -j$(nproc) llama-server
|
||||||
|
|
||||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libcurl4-openssl-dev libgomp1
|
apt-get install -y libcurl4-openssl-dev libgomp1
|
||||||
|
|
||||||
COPY --from=build /app/server /server
|
COPY --from=build /app/llama-server /llama-server
|
||||||
|
|
||||||
ENTRYPOINT [ "/server" ]
|
ENTRYPOINT [ "/llama-server" ]
|
29
.devops/llama-server-intel.Dockerfile
Normal file
29
.devops/llama-server-intel.Dockerfile
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||||
|
|
||||||
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||||
|
|
||||||
|
ARG LLAMA_SYCL_F16=OFF
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y git libcurl4-openssl-dev
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
||||||
|
echo "LLAMA_SYCL_F16 is set" && \
|
||||||
|
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
||||||
|
fi && \
|
||||||
|
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
||||||
|
cmake --build build --config Release --target llama-server
|
||||||
|
|
||||||
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y libcurl4-openssl-dev
|
||||||
|
|
||||||
|
COPY --from=build /app/build/bin/llama-server /llama-server
|
||||||
|
|
||||||
|
ENV LC_ALL=C.utf8
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/llama-server" ]
|
|
@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libcurl4-openssl-dev
|
apt-get install -y libcurl4-openssl-dev
|
||||||
|
|
||||||
RUN make -j$(nproc)
|
RUN make -j$(nproc) llama-server
|
||||||
|
|
||||||
ENTRYPOINT [ "/app/server" ]
|
ENTRYPOINT [ "/app/llama-server" ]
|
|
@ -19,13 +19,13 @@ RUN apt-get update && \
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
COPY . .
|
COPY . .
|
||||||
RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
|
RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
|
||||||
cmake --build build --config Release --target server
|
cmake --build build --config Release --target llama-server
|
||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
WORKDIR /
|
WORKDIR /
|
||||||
RUN cp /app/build/bin/server /server && \
|
RUN cp /app/build/bin/llama-server /llama-server && \
|
||||||
rm -rf /app
|
rm -rf /app
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
ENV LC_ALL=C.utf8
|
||||||
|
|
||||||
ENTRYPOINT [ "/server" ]
|
ENTRYPOINT [ "/llama-server" ]
|
|
@ -11,15 +11,15 @@ COPY . .
|
||||||
|
|
||||||
ENV LLAMA_CURL=1
|
ENV LLAMA_CURL=1
|
||||||
|
|
||||||
RUN make -j$(nproc) server
|
RUN make -j$(nproc) llama-server
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libcurl4-openssl-dev libgomp1
|
apt-get install -y libcurl4-openssl-dev libgomp1
|
||||||
|
|
||||||
COPY --from=build /app/server /server
|
COPY --from=build /app/llama-server /llama-server
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
ENV LC_ALL=C.utf8
|
||||||
|
|
||||||
ENTRYPOINT [ "/server" ]
|
ENTRYPOINT [ "/llama-server" ]
|
|
@ -1,34 +0,0 @@
|
||||||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
|
||||||
|
|
||||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
|
||||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
|
||||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
|
||||||
rm /etc/apt/sources.list.d/intel-graphics.list && \
|
|
||||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
|
||||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
|
||||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
|
||||||
|
|
||||||
ARG LLAMA_SYCL_F16=OFF
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y git
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
|
||||||
echo "LLAMA_SYCL_F16 is set" && \
|
|
||||||
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
|
||||||
fi && \
|
|
||||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
|
||||||
cmake --build build --config Release --target main
|
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
|
||||||
|
|
||||||
COPY --from=build /app/build/bin/main /main
|
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/main" ]
|
|
|
@ -6,11 +6,11 @@
|
||||||
let
|
let
|
||||||
inherit (config.packages) default;
|
inherit (config.packages) default;
|
||||||
binaries = [
|
binaries = [
|
||||||
"llama"
|
"llama-cli"
|
||||||
"llama-embedding"
|
"llama-embedding"
|
||||||
"llama-server"
|
"llama-server"
|
||||||
"quantize"
|
"llama-quantize"
|
||||||
"train-text-from-scratch"
|
"llama-train-text-from-scratch"
|
||||||
];
|
];
|
||||||
mkApp = name: {
|
mkApp = name: {
|
||||||
type = "app";
|
type = "app";
|
||||||
|
|
|
@ -243,8 +243,6 @@ effectiveStdenv.mkDerivation (
|
||||||
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
||||||
# if they haven't been added yet.
|
# if they haven't been added yet.
|
||||||
postInstall = ''
|
postInstall = ''
|
||||||
mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
|
|
||||||
mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
|
|
||||||
mkdir -p $out/include
|
mkdir -p $out/include
|
||||||
cp $src/llama.h $out/include/
|
cp $src/llama.h $out/include/
|
||||||
'';
|
'';
|
||||||
|
@ -294,7 +292,7 @@ effectiveStdenv.mkDerivation (
|
||||||
license = lib.licenses.mit;
|
license = lib.licenses.mit;
|
||||||
|
|
||||||
# Accommodates `nix run` and `lib.getExe`
|
# Accommodates `nix run` and `lib.getExe`
|
||||||
mainProgram = "llama";
|
mainProgram = "llama-cli";
|
||||||
|
|
||||||
# These people might respond, on the best effort basis, if you ping them
|
# These people might respond, on the best effort basis, if you ping them
|
||||||
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
|
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
|
||||||
|
|
|
@ -1,45 +0,0 @@
|
||||||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
|
||||||
|
|
||||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
|
||||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
|
||||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
|
||||||
rm /etc/apt/sources.list.d/intel-graphics.list && \
|
|
||||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
|
||||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
|
||||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
|
||||||
|
|
||||||
ARG LLAMA_SYCL_F16=OFF
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y git libcurl4-openssl-dev
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
|
||||||
echo "LLAMA_SYCL_F16 is set" && \
|
|
||||||
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
|
||||||
fi && \
|
|
||||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
|
||||||
cmake --build build --config Release --target server
|
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
|
||||||
|
|
||||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
|
||||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
|
||||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
|
||||||
rm /etc/apt/sources.list.d/intel-graphics.list && \
|
|
||||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
|
||||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
|
||||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y libcurl4-openssl-dev
|
|
||||||
|
|
||||||
COPY --from=build /app/build/bin/server /server
|
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/server" ]
|
|
|
@ -10,11 +10,11 @@ shift
|
||||||
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
|
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
|
||||||
python3 ./convert-hf-to-gguf.py "$@"
|
python3 ./convert-hf-to-gguf.py "$@"
|
||||||
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
|
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
|
||||||
./quantize "$@"
|
./llama-quantize "$@"
|
||||||
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
||||||
./main "$@"
|
./llama-cli "$@"
|
||||||
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
|
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
|
||||||
./finetune "$@"
|
./llama-finetune "$@"
|
||||||
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
||||||
echo "Converting PTH to GGML..."
|
echo "Converting PTH to GGML..."
|
||||||
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
||||||
|
@ -22,11 +22,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
||||||
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
|
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
|
||||||
else
|
else
|
||||||
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
|
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
|
||||||
./quantize "$i" "${i/f16/q4_0}" q4_0
|
./llama-quantize "$i" "${i/f16/q4_0}" q4_0
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
|
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
|
||||||
./server "$@"
|
./llama-server "$@"
|
||||||
else
|
else
|
||||||
echo "Unknown command: $arg1"
|
echo "Unknown command: $arg1"
|
||||||
echo "Available commands: "
|
echo "Available commands: "
|
||||||
|
|
|
@ -12,8 +12,8 @@ build*/
|
||||||
|
|
||||||
models/*
|
models/*
|
||||||
|
|
||||||
/main
|
/llama-cli
|
||||||
/quantize
|
/llama-quantize
|
||||||
|
|
||||||
arm_neon.h
|
arm_neon.h
|
||||||
compile_commands.json
|
compile_commands.json
|
||||||
|
|
|
@ -26,3 +26,6 @@ indent_size = 2
|
||||||
|
|
||||||
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
|
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
|
||||||
indent_style = tab
|
indent_style = tab
|
||||||
|
|
||||||
|
[examples/cvector-generator/*.txt]
|
||||||
|
insert_final_newline = unset
|
||||||
|
|
2
.github/ISSUE_TEMPLATE/01-bug-low.yml
vendored
2
.github/ISSUE_TEMPLATE/01-bug-low.yml
vendored
|
@ -24,7 +24,7 @@ body:
|
||||||
label: Name and Version
|
label: Name and Version
|
||||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||||
placeholder: |
|
placeholder: |
|
||||||
$./main --version
|
$./llama-cli --version
|
||||||
version: 2999 (42b4109e)
|
version: 2999 (42b4109e)
|
||||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||||
validations:
|
validations:
|
||||||
|
|
2
.github/ISSUE_TEMPLATE/02-bug-medium.yml
vendored
2
.github/ISSUE_TEMPLATE/02-bug-medium.yml
vendored
|
@ -24,7 +24,7 @@ body:
|
||||||
label: Name and Version
|
label: Name and Version
|
||||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||||
placeholder: |
|
placeholder: |
|
||||||
$./main --version
|
$./llama-cli --version
|
||||||
version: 2999 (42b4109e)
|
version: 2999 (42b4109e)
|
||||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||||
validations:
|
validations:
|
||||||
|
|
2
.github/ISSUE_TEMPLATE/03-bug-high.yml
vendored
2
.github/ISSUE_TEMPLATE/03-bug-high.yml
vendored
|
@ -24,7 +24,7 @@ body:
|
||||||
label: Name and Version
|
label: Name and Version
|
||||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||||
placeholder: |
|
placeholder: |
|
||||||
$./main --version
|
$./llama-cli --version
|
||||||
version: 2999 (42b4109e)
|
version: 2999 (42b4109e)
|
||||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||||
validations:
|
validations:
|
||||||
|
|
2
.github/ISSUE_TEMPLATE/04-bug-critical.yml
vendored
2
.github/ISSUE_TEMPLATE/04-bug-critical.yml
vendored
|
@ -24,7 +24,7 @@ body:
|
||||||
label: Name and Version
|
label: Name and Version
|
||||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||||
placeholder: |
|
placeholder: |
|
||||||
$./main --version
|
$./llama-cli --version
|
||||||
version: 2999 (42b4109e)
|
version: 2999 (42b4109e)
|
||||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||||
validations:
|
validations:
|
||||||
|
|
5
.github/pull_request_template.md
vendored
Normal file
5
.github/pull_request_template.md
vendored
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
- Self Reported Review Complexity:
|
||||||
|
- [ ] Review Complexity : Low
|
||||||
|
- [ ] Review Complexity : Medium
|
||||||
|
- [ ] Review Complexity : High
|
||||||
|
- [ ] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
|
2
.github/workflows/bench.yml
vendored
2
.github/workflows/bench.yml
vendored
|
@ -119,7 +119,7 @@ jobs:
|
||||||
-DLLAMA_FATAL_WARNINGS=OFF \
|
-DLLAMA_FATAL_WARNINGS=OFF \
|
||||||
-DLLAMA_ALL_WARNINGS=OFF \
|
-DLLAMA_ALL_WARNINGS=OFF \
|
||||||
-DCMAKE_BUILD_TYPE=Release;
|
-DCMAKE_BUILD_TYPE=Release;
|
||||||
cmake --build build --config Release -j $(nproc) --target server
|
cmake --build build --config Release -j $(nproc) --target llama-server
|
||||||
|
|
||||||
- name: Download the dataset
|
- name: Download the dataset
|
||||||
id: download_dataset
|
id: download_dataset
|
||||||
|
|
21
.github/workflows/build.yml
vendored
21
.github/workflows/build.yml
vendored
|
@ -13,7 +13,7 @@ on:
|
||||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
||||||
pull_request:
|
pull_request:
|
||||||
types: [opened, synchronize, reopened]
|
types: [opened, synchronize, reopened]
|
||||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
@ -84,7 +84,7 @@ jobs:
|
||||||
name: llama-bin-macos-arm64.zip
|
name: llama-bin-macos-arm64.zip
|
||||||
|
|
||||||
macOS-latest-cmake-x64:
|
macOS-latest-cmake-x64:
|
||||||
runs-on: macos-latest
|
runs-on: macos-12
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
|
@ -103,12 +103,10 @@ jobs:
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
sysctl -a
|
sysctl -a
|
||||||
mkdir build
|
|
||||||
cd build
|
|
||||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||||
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||||
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
|
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
|
||||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
id: cmake_test
|
id: cmake_test
|
||||||
|
@ -241,8 +239,8 @@ jobs:
|
||||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||||
echo "Fetch llama2c model"
|
echo "Fetch llama2c model"
|
||||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||||
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||||
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||||
|
|
||||||
- name: Determine tag name
|
- name: Determine tag name
|
||||||
id: tag
|
id: tag
|
||||||
|
@ -684,7 +682,7 @@ jobs:
|
||||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||||
|
|
||||||
windows-latest-cmake:
|
windows-latest-cmake:
|
||||||
runs-on: windows-latest
|
runs-on: windows-2019
|
||||||
|
|
||||||
env:
|
env:
|
||||||
OPENBLAS_VERSION: 0.3.23
|
OPENBLAS_VERSION: 0.3.23
|
||||||
|
@ -829,7 +827,7 @@ jobs:
|
||||||
name: llama-bin-win-${{ matrix.build }}.zip
|
name: llama-bin-win-${{ matrix.build }}.zip
|
||||||
|
|
||||||
windows-latest-cmake-cuda:
|
windows-latest-cmake-cuda:
|
||||||
runs-on: windows-latest
|
runs-on: windows-2019
|
||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
|
@ -843,8 +841,9 @@ jobs:
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- uses: Jimver/cuda-toolkit@v0.2.11
|
- name: Install CUDA toolkit
|
||||||
id: cuda-toolkit
|
id: cuda-toolkit
|
||||||
|
uses: Jimver/cuda-toolkit@v0.2.15
|
||||||
with:
|
with:
|
||||||
cuda: ${{ matrix.cuda }}
|
cuda: ${{ matrix.cuda }}
|
||||||
method: 'network'
|
method: 'network'
|
||||||
|
|
16
.github/workflows/docker.yml
vendored
16
.github/workflows/docker.yml
vendored
|
@ -30,20 +30,20 @@ jobs:
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
config:
|
config:
|
||||||
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
|
- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
|
||||||
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
|
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
|
||||||
# have disabled them for now until the reason why
|
# have disabled them for now until the reason why
|
||||||
# is understood.
|
# is understood.
|
||||||
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||||
|
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||||
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||||
- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
|
||||||
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
|
||||||
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
|
|
||||||
steps:
|
steps:
|
||||||
- name: Check out the repo
|
- name: Check out the repo
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
10
.github/workflows/server.yml
vendored
10
.github/workflows/server.yml
vendored
|
@ -16,11 +16,9 @@ on:
|
||||||
branches:
|
branches:
|
||||||
- master
|
- master
|
||||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||||
pull_request_target:
|
pull_request:
|
||||||
types: [opened, synchronize, reopened]
|
types: [opened, synchronize, reopened]
|
||||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||||
schedule:
|
|
||||||
- cron: '2 4 * * *'
|
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||||
|
@ -98,7 +96,7 @@ jobs:
|
||||||
-DLLAMA_CURL=ON \
|
-DLLAMA_CURL=ON \
|
||||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
||||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
|
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||||
|
|
||||||
- name: Tests
|
- name: Tests
|
||||||
id: server_integration_tests
|
id: server_integration_tests
|
||||||
|
@ -115,7 +113,7 @@ jobs:
|
||||||
|
|
||||||
|
|
||||||
server-windows:
|
server-windows:
|
||||||
runs-on: windows-latest
|
runs-on: windows-2019
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
|
@ -138,7 +136,7 @@ jobs:
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
|
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
|
||||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
|
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
|
||||||
|
|
||||||
- name: Python setup
|
- name: Python setup
|
||||||
id: setup_python
|
id: setup_python
|
||||||
|
|
43
.gitignore
vendored
43
.gitignore
vendored
|
@ -46,48 +46,9 @@ models/*
|
||||||
models-mnt
|
models-mnt
|
||||||
|
|
||||||
/Pipfile
|
/Pipfile
|
||||||
/baby-llama
|
|
||||||
/beam-search
|
|
||||||
/benchmark-matmult
|
|
||||||
/convert-llama2c-to-ggml
|
|
||||||
/embd-input-test
|
|
||||||
/embedding
|
|
||||||
/eval-callback
|
|
||||||
/gguf
|
|
||||||
/gguf-llama-simple
|
|
||||||
/gguf-split
|
|
||||||
/gritlm
|
|
||||||
/imatrix
|
|
||||||
/infill
|
|
||||||
/libllama.so
|
/libllama.so
|
||||||
/llama-bench
|
/llama-*
|
||||||
/llava-cli
|
llama-batched-swift
|
||||||
/lookahead
|
|
||||||
/lookup
|
|
||||||
/lookup-create
|
|
||||||
/lookup-merge
|
|
||||||
/lookup-stats
|
|
||||||
/main
|
|
||||||
/metal
|
|
||||||
/passkey
|
|
||||||
/perplexity
|
|
||||||
/q8dot
|
|
||||||
/quantize
|
|
||||||
/quantize-stats
|
|
||||||
/result
|
|
||||||
/save-load-state
|
|
||||||
/server
|
|
||||||
/simple
|
|
||||||
/batched
|
|
||||||
/batched-bench
|
|
||||||
/export-lora
|
|
||||||
/finetune
|
|
||||||
/retrieval
|
|
||||||
/speculative
|
|
||||||
/parallel
|
|
||||||
/train-text-from-scratch
|
|
||||||
/tokenize
|
|
||||||
/vdot
|
|
||||||
/common/build-info.cpp
|
/common/build-info.cpp
|
||||||
arm_neon.h
|
arm_neon.h
|
||||||
compile_commands.json
|
compile_commands.json
|
||||||
|
|
|
@ -39,8 +39,12 @@ endif()
|
||||||
|
|
||||||
if (APPLE)
|
if (APPLE)
|
||||||
set(LLAMA_METAL_DEFAULT ON)
|
set(LLAMA_METAL_DEFAULT ON)
|
||||||
|
set(LLAMA_BLAS_DEFAULT ON)
|
||||||
|
set(LLAMA_BLAS_VENDOR_DEFAULT "Apple")
|
||||||
else()
|
else()
|
||||||
set(LLAMA_METAL_DEFAULT OFF)
|
set(LLAMA_METAL_DEFAULT OFF)
|
||||||
|
set(LLAMA_BLAS_DEFAULT OFF)
|
||||||
|
set(LLAMA_BLAS_VENDOR_DEFAULT "Generic")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(LLAMA_LLAMAFILE_DEFAULT ON)
|
set(LLAMA_LLAMAFILE_DEFAULT ON)
|
||||||
|
@ -91,9 +95,10 @@ endif()
|
||||||
|
|
||||||
# 3rd party libs
|
# 3rd party libs
|
||||||
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
||||||
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT})
|
||||||
|
set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING
|
||||||
|
"llama: BLAS library vendor")
|
||||||
option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
|
option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
|
||||||
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
|
||||||
option(LLAMA_CUDA "llama: use CUDA" OFF)
|
option(LLAMA_CUDA "llama: use CUDA" OFF)
|
||||||
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
|
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
|
||||||
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
|
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||||
|
@ -311,9 +316,9 @@ if (LLAMA_BLAS)
|
||||||
if (LLAMA_STATIC)
|
if (LLAMA_STATIC)
|
||||||
set(BLA_STATIC ON)
|
set(BLA_STATIC ON)
|
||||||
endif()
|
endif()
|
||||||
if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
|
#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
|
||||||
set(BLA_SIZEOF_INTEGER 8)
|
# set(BLA_SIZEOF_INTEGER 8)
|
||||||
endif()
|
#endif()
|
||||||
|
|
||||||
set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
|
set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
|
||||||
find_package(BLAS)
|
find_package(BLAS)
|
||||||
|
@ -321,7 +326,7 @@ if (LLAMA_BLAS)
|
||||||
if (BLAS_FOUND)
|
if (BLAS_FOUND)
|
||||||
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
|
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
|
||||||
|
|
||||||
if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
|
if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple"))
|
||||||
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
|
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
|
||||||
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
|
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
|
||||||
find_package(PkgConfig REQUIRED)
|
find_package(PkgConfig REQUIRED)
|
||||||
|
@ -374,12 +379,15 @@ if (LLAMA_BLAS)
|
||||||
|
|
||||||
add_compile_options(${BLAS_LINKER_FLAGS})
|
add_compile_options(${BLAS_LINKER_FLAGS})
|
||||||
|
|
||||||
add_compile_definitions(GGML_USE_OPENBLAS)
|
add_compile_definitions(GGML_USE_BLAS)
|
||||||
|
|
||||||
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
|
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
|
||||||
add_compile_definitions(GGML_BLAS_USE_MKL)
|
add_compile_definitions(GGML_BLAS_USE_MKL)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
set(GGML_HEADERS_BLAS ggml-blas.h)
|
||||||
|
set(GGML_SOURCES_BLAS ggml-blas.cpp)
|
||||||
|
|
||||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
|
||||||
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
|
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
|
||||||
else()
|
else()
|
||||||
|
@ -402,12 +410,26 @@ if (LLAMA_CUBLAS)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_CUDA)
|
if (LLAMA_CUDA)
|
||||||
cmake_minimum_required(VERSION 3.17)
|
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
|
||||||
|
|
||||||
find_package(CUDAToolkit)
|
find_package(CUDAToolkit)
|
||||||
if (CUDAToolkit_FOUND)
|
if (CUDAToolkit_FOUND)
|
||||||
message(STATUS "CUDA found")
|
message(STATUS "CUDA found")
|
||||||
|
|
||||||
|
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||||
|
# 52 == lowest CUDA 12 standard
|
||||||
|
# 60 == f16 CUDA intrinsics
|
||||||
|
# 61 == integer CUDA intrinsics
|
||||||
|
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
|
||||||
|
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
|
||||||
|
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
|
||||||
|
else()
|
||||||
|
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||||
|
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||||
|
|
||||||
enable_language(CUDA)
|
enable_language(CUDA)
|
||||||
|
|
||||||
set(GGML_HEADERS_CUDA ggml-cuda.h)
|
set(GGML_HEADERS_CUDA ggml-cuda.h)
|
||||||
|
@ -472,21 +494,6 @@ if (LLAMA_CUDA)
|
||||||
else()
|
else()
|
||||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
|
||||||
# 52 == lowest CUDA 12 standard
|
|
||||||
# 60 == f16 CUDA intrinsics
|
|
||||||
# 61 == integer CUDA intrinsics
|
|
||||||
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
|
|
||||||
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
|
|
||||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
|
|
||||||
else()
|
|
||||||
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
|
|
||||||
#set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
|
||||||
|
|
||||||
else()
|
else()
|
||||||
message(WARNING "CUDA not found")
|
message(WARNING "CUDA not found")
|
||||||
endif()
|
endif()
|
||||||
|
@ -677,7 +684,8 @@ if (LLAMA_SYCL)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(GGML_HEADERS_SYCL ggml-sycl.h)
|
set(GGML_HEADERS_SYCL ggml-sycl.h)
|
||||||
set(GGML_SOURCES_SYCL ggml-sycl.cpp)
|
file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
|
||||||
|
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
|
||||||
|
|
||||||
if (WIN32)
|
if (WIN32)
|
||||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
|
||||||
|
@ -1259,6 +1267,7 @@ add_library(ggml OBJECT
|
||||||
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
|
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
|
||||||
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
|
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
|
||||||
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
|
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
|
||||||
|
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
|
||||||
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
|
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
14
CONTRIBUTING.md
Normal file
14
CONTRIBUTING.md
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
# Contributing Guidelines
|
||||||
|
|
||||||
|
## Checklist
|
||||||
|
|
||||||
|
* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
|
||||||
|
* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
|
||||||
|
* Execute [the full CI locally on your machine](ci/README.md) before publishing
|
||||||
|
|
||||||
|
## PR formatting
|
||||||
|
|
||||||
|
* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
|
||||||
|
- The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your convenience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
|
||||||
|
* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
|
||||||
|
* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`
|
170
Makefile
170
Makefile
|
@ -1,8 +1,45 @@
|
||||||
# Define the default target now so that it is always the first target
|
# Define the default target now so that it is always the first target
|
||||||
BUILD_TARGETS = \
|
BUILD_TARGETS = \
|
||||||
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
libllava.a \
|
||||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
|
llama-baby-llama \
|
||||||
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
llama-batched \
|
||||||
|
llama-batched-bench \
|
||||||
|
llama-bench \
|
||||||
|
llama-benchmark-matmult \
|
||||||
|
llama-cli \
|
||||||
|
llama-convert-llama2c-to-ggml \
|
||||||
|
llama-embedding \
|
||||||
|
llama-eval-callback \
|
||||||
|
llama-export-lora \
|
||||||
|
llama-finetune \
|
||||||
|
llama-gbnf-validator \
|
||||||
|
llama-gguf \
|
||||||
|
llama-gguf-split \
|
||||||
|
llama-gritlm \
|
||||||
|
llama-imatrix \
|
||||||
|
llama-infill \
|
||||||
|
llama-llava-cli \
|
||||||
|
llama-lookahead \
|
||||||
|
llama-lookup \
|
||||||
|
llama-lookup-create \
|
||||||
|
llama-lookup-merge \
|
||||||
|
llama-lookup-stats \
|
||||||
|
llama-parallel \
|
||||||
|
llama-passkey \
|
||||||
|
llama-perplexity \
|
||||||
|
llama-q8dot \
|
||||||
|
llama-quantize \
|
||||||
|
llama-quantize-stats \
|
||||||
|
llama-retrieval \
|
||||||
|
llama-save-load-state \
|
||||||
|
llama-server \
|
||||||
|
llama-simple \
|
||||||
|
llama-speculative \
|
||||||
|
llama-tokenize \
|
||||||
|
llama-train-text-from-scratch \
|
||||||
|
llama-vdot \
|
||||||
|
llama-cvector-generator \
|
||||||
|
tests/test-c.o
|
||||||
|
|
||||||
# Binaries only useful for tests
|
# Binaries only useful for tests
|
||||||
TEST_TARGETS = \
|
TEST_TARGETS = \
|
||||||
|
@ -404,10 +441,11 @@ ifndef LLAMA_NO_ACCELERATE
|
||||||
# Mac OS - include Accelerate framework.
|
# Mac OS - include Accelerate framework.
|
||||||
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
|
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
|
||||||
ifeq ($(UNAME_S),Darwin)
|
ifeq ($(UNAME_S),Darwin)
|
||||||
MK_CPPFLAGS += -DGGML_USE_ACCELERATE
|
MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
|
||||||
MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
|
MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
|
||||||
MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
|
MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
|
||||||
MK_LDFLAGS += -framework Accelerate
|
MK_LDFLAGS += -framework Accelerate
|
||||||
|
OBJS += ggml-blas.o
|
||||||
endif
|
endif
|
||||||
endif # LLAMA_NO_ACCELERATE
|
endif # LLAMA_NO_ACCELERATE
|
||||||
|
|
||||||
|
@ -418,21 +456,30 @@ ifndef LLAMA_NO_OPENMP
|
||||||
endif # LLAMA_NO_OPENMP
|
endif # LLAMA_NO_OPENMP
|
||||||
|
|
||||||
ifdef LLAMA_OPENBLAS
|
ifdef LLAMA_OPENBLAS
|
||||||
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
|
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
|
||||||
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
|
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
|
||||||
MK_LDFLAGS += $(shell pkg-config --libs openblas)
|
MK_LDFLAGS += $(shell pkg-config --libs openblas)
|
||||||
|
OBJS += ggml-blas.o
|
||||||
endif # LLAMA_OPENBLAS
|
endif # LLAMA_OPENBLAS
|
||||||
|
|
||||||
|
ifdef LLAMA_OPENBLAS64
|
||||||
|
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
|
||||||
|
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64)
|
||||||
|
MK_LDFLAGS += $(shell pkg-config --libs openblas64)
|
||||||
|
OBJS += ggml-blas.o
|
||||||
|
endif # LLAMA_OPENBLAS64
|
||||||
|
|
||||||
|
ifdef LLAMA_BLIS
|
||||||
|
MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||||
|
MK_LDFLAGS += -lblis -L/usr/local/lib
|
||||||
|
OBJS += ggml-blas.o
|
||||||
|
endif # LLAMA_BLIS
|
||||||
|
|
||||||
ifndef LLAMA_NO_LLAMAFILE
|
ifndef LLAMA_NO_LLAMAFILE
|
||||||
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
||||||
OBJS += sgemm.o
|
OBJS += sgemm.o
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_BLIS
|
|
||||||
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
|
||||||
MK_LDFLAGS += -lblis -L/usr/local/lib
|
|
||||||
endif # LLAMA_BLIS
|
|
||||||
|
|
||||||
ifdef LLAMA_RPC
|
ifdef LLAMA_RPC
|
||||||
MK_CPPFLAGS += -DGGML_USE_RPC
|
MK_CPPFLAGS += -DGGML_USE_RPC
|
||||||
OBJS += ggml-rpc.o
|
OBJS += ggml-rpc.o
|
||||||
|
@ -740,6 +787,9 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
|
||||||
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
|
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
|
||||||
$(CC) $(CFLAGS) -c $< -o $@
|
$(CC) $(CFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
ggml-blas.o: ggml-blas.cpp ggml-blas.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
unicode.o: unicode.cpp unicode.h
|
unicode.o: unicode.cpp unicode.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
@ -777,7 +827,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
|
||||||
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
|
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
rm -vrf *.o tests/*.o *.so *.a *.dll common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||||
rm -vrf ggml-cuda/*.o
|
rm -vrf ggml-cuda/*.o
|
||||||
rm -vrf ggml-cuda/template-instances/*.o
|
rm -vrf ggml-cuda/template-instances/*.o
|
||||||
find examples pocs -type f -name "*.o" -delete
|
find examples pocs -type f -name "*.o" -delete
|
||||||
|
@ -793,62 +843,62 @@ clean:
|
||||||
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
|
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
|
||||||
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
|
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
|
||||||
|
|
||||||
main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
llama-cli: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
@echo
|
@echo
|
||||||
@echo '==== Run ./main -h for help. ===='
|
@echo '==== Run ./llama-cli -h for help. ===='
|
||||||
@echo
|
@echo
|
||||||
|
|
||||||
infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
llama-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
|
llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
llama-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
||||||
|
|
||||||
|
@ -861,23 +911,27 @@ examples/server/%.hpp: examples/server/public/% Makefile
|
||||||
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
|
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
|
||||||
) > $@
|
) > $@
|
||||||
|
|
||||||
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
|
llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
@ -888,55 +942,61 @@ llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS)
|
||||||
libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
|
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
|
||||||
|
|
||||||
llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
|
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
|
||||||
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
|
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
llama-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
|
llama-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
llama-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
llama-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
|
||||||
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
|
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
|
|
||||||
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
|
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
|
|
||||||
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
|
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
|
|
||||||
|
|
||||||
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
llama-lookup-create: examples/lookup/lookup-create.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
llama-lookup-merge: examples/lookup/lookup-merge.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
llama-lookup-stats: examples/lookup/lookup-stats.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
llama-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
@ -962,20 +1022,20 @@ build-info.o: common/build-info.cpp
|
||||||
|
|
||||||
tests: $(TEST_TARGETS)
|
tests: $(TEST_TARGETS)
|
||||||
|
|
||||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
|
llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
run-benchmark-matmult: benchmark-matmult
|
run-benchmark-matmult: llama-benchmark-matmult
|
||||||
./$@
|
./$@
|
||||||
|
|
||||||
.PHONY: run-benchmark-matmult swift
|
.PHONY: run-benchmark-matmult swift
|
||||||
|
|
||||||
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
llama-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
|
llama-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
|
|
@ -77,7 +77,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
|
||||||
*Notes:*
|
*Notes:*
|
||||||
|
|
||||||
- **Memory**
|
- **Memory**
|
||||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
|
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
|
||||||
|
|
||||||
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, *llama-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.
|
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, *llama-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.
|
||||||
|
|
||||||
|
@ -99,14 +99,14 @@ The docker build option is currently limited to *intel GPU* targets.
|
||||||
### Build image
|
### Build image
|
||||||
```sh
|
```sh
|
||||||
# Using FP16
|
# Using FP16
|
||||||
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
|
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
|
||||||
```
|
```
|
||||||
|
|
||||||
*Notes*:
|
*Notes*:
|
||||||
|
|
||||||
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
|
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
|
||||||
|
|
||||||
You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
|
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
||||||
|
|
||||||
### Run container
|
### Run container
|
||||||
|
|
||||||
|
@ -275,7 +275,7 @@ source /opt/intel/oneapi/setvars.sh
|
||||||
Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
|
Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
./build/bin/ls-sycl-device
|
./build/bin/llama-ls-sycl-device
|
||||||
```
|
```
|
||||||
An example of such a log on a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
|
An example of such a log on a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
|
||||||
```
|
```
|
||||||
|
@ -313,7 +313,7 @@ Examples:
|
||||||
- Use device 0:
|
- Use device 0:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||||
```
|
```
|
||||||
or run by script:
|
or run by script:
|
||||||
|
|
||||||
|
@ -324,7 +324,7 @@ or run by script:
|
||||||
- Use multiple devices:
|
- Use multiple devices:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||||
```
|
```
|
||||||
|
|
||||||
Otherwise, you can run the script:
|
Otherwise, you can run the script:
|
||||||
|
@ -427,7 +427,7 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
|
||||||
|
|
||||||
*Notes:*
|
*Notes:*
|
||||||
|
|
||||||
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
|
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`.
|
||||||
|
|
||||||
### III. Run the inference
|
### III. Run the inference
|
||||||
|
|
||||||
|
@ -488,13 +488,13 @@ Examples:
|
||||||
- Use device 0:
|
- Use device 0:
|
||||||
|
|
||||||
```
|
```
|
||||||
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||||
```
|
```
|
||||||
|
|
||||||
- Use multiple devices:
|
- Use multiple devices:
|
||||||
|
|
||||||
```
|
```
|
||||||
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||||
```
|
```
|
||||||
Otherwise, run the following wrapper script:
|
Otherwise, run the following wrapper script:
|
||||||
|
|
||||||
|
|
69
README.md
69
README.md
|
@ -10,6 +10,9 @@
|
||||||
|
|
||||||
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
||||||
|
|
||||||
|
> [!IMPORTANT]
|
||||||
|
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
|
||||||
|
|
||||||
### Recent API changes
|
### Recent API changes
|
||||||
|
|
||||||
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
|
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
|
||||||
|
@ -53,7 +56,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||||
<li><a href="#quantization">Quantization</a></li>
|
<li><a href="#quantization">Quantization</a></li>
|
||||||
<li><a href="#interactive-mode">Interactive mode</a></li>
|
<li><a href="#interactive-mode">Interactive mode</a></li>
|
||||||
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
|
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
|
||||||
<li><a href="#instruct-mode">Instruct mode</a></li>
|
|
||||||
<li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
|
<li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
|
||||||
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
|
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
|
||||||
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
|
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
|
||||||
|
@ -218,7 +220,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||||
Here is a typical run using LLaMA v2 13B on M2 Ultra:
|
Here is a typical run using LLaMA v2 13B on M2 Ultra:
|
||||||
|
|
||||||
```
|
```
|
||||||
$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
|
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
|
||||||
I llama.cpp build info:
|
I llama.cpp build info:
|
||||||
I UNAME_S: Darwin
|
I UNAME_S: Darwin
|
||||||
I UNAME_P: arm
|
I UNAME_P: arm
|
||||||
|
@ -556,7 +558,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# Build the image
|
# Build the image
|
||||||
docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
|
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
|
||||||
|
|
||||||
# Then, use it:
|
# Then, use it:
|
||||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||||
|
@ -577,7 +579,9 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
vulkaninfo
|
vulkaninfo
|
||||||
```
|
```
|
||||||
|
|
||||||
Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
|
Alternatively your package manager might be able to provide the appropriate libraries.
|
||||||
|
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
|
||||||
|
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
|
||||||
|
|
||||||
Then, build llama.cpp using the cmake command below:
|
Then, build llama.cpp using the cmake command below:
|
||||||
|
|
||||||
|
@ -585,7 +589,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
cmake -B build -DLLAMA_VULKAN=1
|
cmake -B build -DLLAMA_VULKAN=1
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
||||||
./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||||
|
|
||||||
# You should see in the output, ggml_vulkan detected your GPU. For example:
|
# You should see in the output, ggml_vulkan detected your GPU. For example:
|
||||||
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
|
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
|
||||||
|
@ -618,21 +622,18 @@ python3 -m pip install -r requirements.txt
|
||||||
# convert the model to ggml FP16 format
|
# convert the model to ggml FP16 format
|
||||||
python3 convert-hf-to-gguf.py models/mymodel/
|
python3 convert-hf-to-gguf.py models/mymodel/
|
||||||
|
|
||||||
# [Optional] for models using BPE tokenizers
|
|
||||||
python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
|
|
||||||
|
|
||||||
# quantize the model to 4-bits (using Q4_K_M method)
|
# quantize the model to 4-bits (using Q4_K_M method)
|
||||||
./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
|
./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||||
|
|
||||||
# update the gguf filetype to current version if older version is now unsupported
|
# update the gguf filetype to current version if older version is now unsupported
|
||||||
./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
|
./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
|
||||||
```
|
```
|
||||||
|
|
||||||
### Run the quantized model
|
### Run the quantized model
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# start inference on a gguf model
|
# start inference on a gguf model
|
||||||
./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
|
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
|
||||||
```
|
```
|
||||||
|
|
||||||
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
||||||
|
@ -707,7 +708,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread
|
||||||
#### How to run
|
#### How to run
|
||||||
|
|
||||||
1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
|
2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
|
||||||
3. Output:
|
3. Output:
|
||||||
```
|
```
|
||||||
perplexity : calculating perplexity over 655 chunks
|
perplexity : calculating perplexity over 655 chunks
|
||||||
|
@ -731,16 +732,16 @@ Here is an example of a few-shot interaction, invoked with the command
|
||||||
./examples/chat-13B.sh
|
./examples/chat-13B.sh
|
||||||
|
|
||||||
# custom arguments using a 13B model
|
# custom arguments using a 13B model
|
||||||
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
|
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
|
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
### Persistent Interaction
|
### Persistent Interaction
|
||||||
|
|
||||||
The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
|
The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Start a new chat
|
# Start a new chat
|
||||||
|
@ -762,41 +763,13 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
|
||||||
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
|
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
|
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
|
||||||
```
|
```
|
||||||
|
|
||||||
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
|
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
|
||||||
|
|
||||||
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
|
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
|
||||||
|
|
||||||
### Instruct mode
|
|
||||||
|
|
||||||
1. First, download and place the `ggml` model into the `./models` folder
|
|
||||||
2. Run the `main` tool like this:
|
|
||||||
|
|
||||||
```
|
|
||||||
./examples/alpaca.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
Sample run:
|
|
||||||
|
|
||||||
```
|
|
||||||
== Running in interactive mode. ==
|
|
||||||
- Press Ctrl+C to interject at any time.
|
|
||||||
- Press Return to return control to LLaMA.
|
|
||||||
- If you want to submit another line, end your input in '\'.
|
|
||||||
|
|
||||||
Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
|
||||||
|
|
||||||
> How many letters are there in the English alphabet?
|
|
||||||
There 26 letters in the English Alphabet
|
|
||||||
> What is the most common way of transportation in Amsterdam?
|
|
||||||
The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
|
|
||||||
> List 5 words that start with "ca".
|
|
||||||
cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
|
|
||||||
>
|
|
||||||
```
|
|
||||||
|
|
||||||
### Obtaining and using the Facebook LLaMA 2 model
|
### Obtaining and using the Facebook LLaMA 2 model
|
||||||
|
|
||||||
- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
|
- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
|
||||||
|
@ -869,7 +842,7 @@ $mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/ho
|
||||||
Now, you can start chatting:
|
Now, you can start chatting:
|
||||||
```
|
```
|
||||||
$cd /data/data/com.termux/files/home/bin
|
$cd /data/data/com.termux/files/home/bin
|
||||||
$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
|
$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
|
||||||
```
|
```
|
||||||
|
|
||||||
Here's a demo of an interactive session running on Pixel 5 phone:
|
Here's a demo of an interactive session running on Pixel 5 phone:
|
||||||
|
@ -936,8 +909,8 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
|
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
|
||||||
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
|
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
|
||||||
docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
|
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
|
||||||
```
|
```
|
||||||
|
|
||||||
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
|
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
|
||||||
|
@ -987,7 +960,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
|
||||||
|
|
||||||
### Docs
|
### Docs
|
||||||
|
|
||||||
- [main](./examples/main/README.md)
|
- [main (cli)](./examples/main/README.md)
|
||||||
- [server](./examples/server/README.md)
|
- [server](./examples/server/README.md)
|
||||||
- [jeopardy](./examples/jeopardy/README.md)
|
- [jeopardy](./examples/jeopardy/README.md)
|
||||||
- [BLIS](./docs/BLIS.md)
|
- [BLIS](./docs/BLIS.md)
|
||||||
|
|
224
ci/run.sh
224
ci/run.sh
|
@ -303,47 +303,47 @@ function gg_run_open_llama_7b_v2 {
|
||||||
|
|
||||||
wiki_test="${path_wiki}/wiki.test.raw"
|
wiki_test="${path_wiki}/wiki.test.raw"
|
||||||
|
|
||||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@ -437,45 +437,45 @@ function gg_run_pythia_1_4b {
|
||||||
|
|
||||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||||
|
|
||||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@ -569,47 +569,47 @@ function gg_run_pythia_2_8b {
|
||||||
|
|
||||||
wiki_test="${path_wiki}/wiki.test.raw"
|
wiki_test="${path_wiki}/wiki.test.raw"
|
||||||
|
|
||||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@ -693,10 +693,10 @@ function gg_run_embd_bge_small {
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
|
||||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
|
||||||
(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
}
|
}
|
||||||
|
|
|
@ -1576,6 +1576,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
params.out_file = argv[i];
|
params.out_file = argv[i];
|
||||||
|
params.cvector_outfile = argv[i];
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (arg == "-ofreq" || arg == "--output-frequency") {
|
if (arg == "-ofreq" || arg == "--output-frequency") {
|
||||||
|
@ -1610,6 +1611,55 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||||
params.i_chunk = std::stoi(argv[i]);
|
params.i_chunk = std::stoi(argv[i]);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
// cvector params
|
||||||
|
if (arg == "--completions-file") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
params.cvector_completions_file = argv[i];
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (arg == "--positive-file") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
params.cvector_positive_file = argv[i];
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (arg == "--negative-file") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
params.cvector_negative_file = argv[i];
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (arg == "--completions") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
params.n_completions = std::stoi(argv[i]);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (arg == "--pca-batch") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
params.n_pca_batch = std::stoi(argv[i]);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (arg == "--pca-iter") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
params.n_pca_iterations = std::stoi(argv[i]);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
// Parse args for logging parameters
|
// Parse args for logging parameters
|
||||||
if (log_param_single_parse(argv[i])) {
|
if (log_param_single_parse(argv[i])) {
|
||||||
|
@ -1931,6 +1981,16 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||||
options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
|
options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
|
||||||
#endif // LOG_DISABLE_LOGS
|
#endif // LOG_DISABLE_LOGS
|
||||||
|
|
||||||
|
options.push_back({ "cvector" });
|
||||||
|
options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
|
||||||
|
options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
|
||||||
|
options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
|
||||||
|
options.push_back({ "cvector", " --completions-file FNAME",
|
||||||
|
"completions file (default: '%s')", params.cvector_completions_file.c_str() });
|
||||||
|
options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
|
||||||
|
// Flag name must match the spelling accepted by gpt_params_find_arg ("--pca-batch").
options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
|
||||||
|
// Flag name must match the spelling accepted by gpt_params_find_arg ("--pca-iter").
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
|
||||||
|
|
||||||
printf("usage: %s [options]\n", argv[0]);
|
printf("usage: %s [options]\n", argv[0]);
|
||||||
|
|
||||||
for (const auto & o : options) {
|
for (const auto & o : options) {
|
||||||
|
|
|
@ -232,6 +232,15 @@ struct gpt_params {
|
||||||
|
|
||||||
bool process_output = false; // collect data for the output tensor
|
bool process_output = false; // collect data for the output tensor
|
||||||
bool compute_ppl = true; // whether to compute perplexity
|
bool compute_ppl = true; // whether to compute perplexity
|
||||||
|
|
||||||
|
// cvector-generator params
|
||||||
|
int n_completions = 64;
|
||||||
|
int n_pca_batch = 20;
|
||||||
|
int n_pca_iterations = 1000;
|
||||||
|
std::string cvector_outfile = "control_vector.gguf";
|
||||||
|
std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
|
||||||
|
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
||||||
|
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
||||||
};
|
};
|
||||||
|
|
||||||
void gpt_params_handle_model_default(gpt_params & params);
|
void gpt_params_handle_model_default(gpt_params & params);
|
||||||
|
|
|
@ -40,7 +40,7 @@ static std::string build_repetition(const std::string & item_rule, int min_items
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string SPACE_RULE = "\" \"?";
|
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
|
||||||
|
|
||||||
struct BuiltinRule {
|
struct BuiltinRule {
|
||||||
std::string content;
|
std::string content;
|
||||||
|
@ -57,7 +57,7 @@ std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
|
||||||
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
|
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
|
||||||
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
|
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
|
||||||
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
|
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
|
||||||
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
|
{"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
|
||||||
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
|
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
|
||||||
{"null", {"\"null\" space", {}}},
|
{"null", {"\"null\" space", {}}},
|
||||||
};
|
};
|
||||||
|
|
|
@ -83,6 +83,7 @@ models = [
|
||||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||||
|
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
|
||||||
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -481,6 +481,9 @@ class Model:
|
||||||
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
||||||
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
||||||
res = "smaug-bpe"
|
res = "smaug-bpe"
|
||||||
|
if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
|
||||||
|
# ref: https://huggingface.co/LumiOpen/Poro-34B-chat
|
||||||
|
res = "poro-chat"
|
||||||
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
|
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
|
||||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
|
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
|
||||||
res = "jina-v2-code"
|
res = "jina-v2-code"
|
||||||
|
|
|
@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
|
||||||
|
|
||||||
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
|
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
|
||||||
|
|
||||||
Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
|
Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
|
||||||
|
|
||||||
## GGUF specification
|
## GGUF specification
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
## Verifying that the model is running on the GPU with CUDA
|
## Verifying that the model is running on the GPU with CUDA
|
||||||
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
||||||
```shell
|
```shell
|
||||||
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
||||||
```
|
```
|
||||||
|
|
||||||
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
|
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
|
||||||
|
@ -27,7 +27,7 @@ RAM: 32GB
|
||||||
|
|
||||||
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
|
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
|
||||||
|
|
||||||
Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
||||||
|
|
||||||
Result:
|
Result:
|
||||||
|
|
||||||
|
|
|
@ -12,43 +12,45 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||||
|
|
||||||
if (EMSCRIPTEN)
|
if (EMSCRIPTEN)
|
||||||
else()
|
else()
|
||||||
|
add_subdirectory(cvector-generator)
|
||||||
add_subdirectory(baby-llama)
|
add_subdirectory(baby-llama)
|
||||||
add_subdirectory(batched)
|
|
||||||
add_subdirectory(batched-bench)
|
add_subdirectory(batched-bench)
|
||||||
|
add_subdirectory(batched)
|
||||||
add_subdirectory(benchmark)
|
add_subdirectory(benchmark)
|
||||||
add_subdirectory(convert-llama2c-to-ggml)
|
add_subdirectory(convert-llama2c-to-ggml)
|
||||||
add_subdirectory(embedding)
|
add_subdirectory(embedding)
|
||||||
add_subdirectory(eval-callback)
|
add_subdirectory(eval-callback)
|
||||||
|
add_subdirectory(export-lora)
|
||||||
add_subdirectory(finetune)
|
add_subdirectory(finetune)
|
||||||
add_subdirectory(gritlm)
|
add_subdirectory(gbnf-validator)
|
||||||
add_subdirectory(gguf-split)
|
add_subdirectory(gguf-split)
|
||||||
|
add_subdirectory(gguf)
|
||||||
|
add_subdirectory(gritlm)
|
||||||
|
add_subdirectory(imatrix)
|
||||||
add_subdirectory(infill)
|
add_subdirectory(infill)
|
||||||
add_subdirectory(llama-bench)
|
add_subdirectory(llama-bench)
|
||||||
add_subdirectory(llava)
|
add_subdirectory(llava)
|
||||||
if (LLAMA_SYCL)
|
|
||||||
add_subdirectory(sycl)
|
|
||||||
endif()
|
|
||||||
add_subdirectory(main)
|
|
||||||
add_subdirectory(tokenize)
|
|
||||||
add_subdirectory(parallel)
|
|
||||||
add_subdirectory(perplexity)
|
|
||||||
add_subdirectory(quantize)
|
|
||||||
add_subdirectory(quantize-stats)
|
|
||||||
add_subdirectory(retrieval)
|
|
||||||
add_subdirectory(save-load-state)
|
|
||||||
add_subdirectory(simple)
|
|
||||||
add_subdirectory(passkey)
|
|
||||||
add_subdirectory(speculative)
|
|
||||||
add_subdirectory(lookahead)
|
add_subdirectory(lookahead)
|
||||||
add_subdirectory(lookup)
|
add_subdirectory(lookup)
|
||||||
add_subdirectory(gguf)
|
add_subdirectory(main)
|
||||||
add_subdirectory(train-text-from-scratch)
|
add_subdirectory(parallel)
|
||||||
add_subdirectory(imatrix)
|
add_subdirectory(passkey)
|
||||||
if (LLAMA_BUILD_SERVER)
|
add_subdirectory(perplexity)
|
||||||
add_subdirectory(server)
|
add_subdirectory(quantize-stats)
|
||||||
endif()
|
add_subdirectory(quantize)
|
||||||
add_subdirectory(export-lora)
|
add_subdirectory(retrieval)
|
||||||
if (LLAMA_RPC)
|
if (LLAMA_RPC)
|
||||||
add_subdirectory(rpc)
|
add_subdirectory(rpc)
|
||||||
endif()
|
endif()
|
||||||
|
if (LLAMA_BUILD_SERVER)
|
||||||
|
add_subdirectory(server)
|
||||||
|
endif()
|
||||||
|
if (LLAMA_SYCL)
|
||||||
|
add_subdirectory(sycl)
|
||||||
|
endif()
|
||||||
|
add_subdirectory(save-load-state)
|
||||||
|
add_subdirectory(simple)
|
||||||
|
add_subdirectory(speculative)
|
||||||
|
add_subdirectory(tokenize)
|
||||||
|
add_subdirectory(train-text-from-scratch)
|
||||||
endif()
|
endif()
|
||||||
|
|
|
@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
|
||||||
GEN_OPTIONS+=(--threads "$N_THREAD")
|
GEN_OPTIONS+=(--threads "$N_THREAD")
|
||||||
fi
|
fi
|
||||||
|
|
||||||
./main "${GEN_OPTIONS[@]}" \
|
./llama-cli "${GEN_OPTIONS[@]}" \
|
||||||
--model "$MODEL" \
|
--model "$MODEL" \
|
||||||
--in-prefix " " \
|
--in-prefix " " \
|
||||||
--in-suffix "${AI_NAME}:" \
|
--in-suffix "${AI_NAME}:" \
|
||||||
|
|
|
@ -1,19 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
#
|
|
||||||
# Temporary script - will be removed in the future
|
|
||||||
#
|
|
||||||
|
|
||||||
cd `dirname $0`
|
|
||||||
cd ..
|
|
||||||
|
|
||||||
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
|
|
||||||
--color \
|
|
||||||
-f ./prompts/alpaca.txt \
|
|
||||||
--ctx_size 2048 \
|
|
||||||
-n -1 \
|
|
||||||
-ins -b 256 \
|
|
||||||
--top_k 10000 \
|
|
||||||
--temp 0.2 \
|
|
||||||
--repeat_penalty 1.1 \
|
|
||||||
-t 7
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET baby-llama)
|
set(TARGET llama-baby-llama)
|
||||||
add_executable(${TARGET} baby-llama.cpp)
|
add_executable(${TARGET} baby-llama.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -58,4 +58,4 @@ echo "$2
|
||||||
model=$1
|
model=$1
|
||||||
|
|
||||||
# generate the most likely continuation until the string "===" is found
|
# generate the most likely continuation until the string "===" is found
|
||||||
./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
|
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET batched-bench)
|
set(TARGET llama-batched-bench)
|
||||||
add_executable(${TARGET} batched-bench.cpp)
|
add_executable(${TARGET} batched-bench.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -10,16 +10,16 @@ There are 2 modes of operation:
|
||||||
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
|
./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
|
||||||
|
|
||||||
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
||||||
./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
|
./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
|
||||||
|
|
||||||
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
||||||
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
|
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
|
||||||
|
|
||||||
# custom set of batches
|
# custom set of batches
|
||||||
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
|
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
|
||||||
```
|
```
|
||||||
|
|
||||||
## Sample results
|
## Sample results
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
.PHONY: build
|
.PHONY: build
|
||||||
|
|
||||||
build:
|
build:
|
||||||
xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
|
xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
|
||||||
rm -f ./batched_swift
|
rm -f ./llama-batched-swift
|
||||||
ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
|
ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
import PackageDescription
|
import PackageDescription
|
||||||
|
|
||||||
let package = Package(
|
let package = Package(
|
||||||
name: "batched_swift",
|
name: "llama-batched-swift",
|
||||||
platforms: [.macOS(.v12)],
|
platforms: [.macOS(.v12)],
|
||||||
dependencies: [
|
dependencies: [
|
||||||
.package(name: "llama", path: "../../"),
|
.package(name: "llama", path: "../../"),
|
||||||
|
@ -13,7 +13,7 @@ let package = Package(
|
||||||
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
||||||
// Targets can depend on other targets in this package and products from dependencies.
|
// Targets can depend on other targets in this package and products from dependencies.
|
||||||
.executableTarget(
|
.executableTarget(
|
||||||
name: "batched_swift",
|
name: "llama-batched-swift",
|
||||||
dependencies: ["llama"],
|
dependencies: ["llama"],
|
||||||
path: "Sources",
|
path: "Sources",
|
||||||
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
|
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
This is a swift clone of `examples/batched`.
|
This is a swift clone of `examples/batched`.
|
||||||
|
|
||||||
$ `make`
|
$ `make`
|
||||||
$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
|
$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET batched)
|
set(TARGET llama-batched)
|
||||||
add_executable(${TARGET} batched.cpp)
|
add_executable(${TARGET} batched.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
The example demonstrates batched generation from a given prompt
|
The example demonstrates batched generation from a given prompt
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
|
./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
|
||||||
|
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET benchmark)
|
set(TARGET llama-bench-matmult)
|
||||||
add_executable(${TARGET} benchmark-matmult.cpp)
|
add_executable(${TARGET} benchmark-matmult.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
|
||||||
$PROMPT_TEMPLATE > $PROMPT_FILE
|
$PROMPT_TEMPLATE > $PROMPT_FILE
|
||||||
|
|
||||||
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
||||||
./main $GEN_OPTIONS \
|
./llama-cli $GEN_OPTIONS \
|
||||||
--model "$MODEL" \
|
--model "$MODEL" \
|
||||||
--threads "$N_THREAD" \
|
--threads "$N_THREAD" \
|
||||||
--n_predict "$N_PREDICTS" \
|
--n_predict "$N_PREDICTS" \
|
||||||
|
|
|
@ -62,7 +62,7 @@ fi
|
||||||
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
|
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
|
||||||
echo 'Prompt cache does not exist, building...'
|
echo 'Prompt cache does not exist, building...'
|
||||||
# Default batch_size to 64 here for better user feedback during initial prompt processing
|
# Default batch_size to 64 here for better user feedback during initial prompt processing
|
||||||
./main 2>>"$LOG" \
|
./llama-cli 2>>"$LOG" \
|
||||||
--batch_size 64 \
|
--batch_size 64 \
|
||||||
"${OPTS[@]}" \
|
"${OPTS[@]}" \
|
||||||
--prompt-cache "$PROMPT_CACHE_FILE" \
|
--prompt-cache "$PROMPT_CACHE_FILE" \
|
||||||
|
@ -109,13 +109,13 @@ while read -e line; do
|
||||||
|
|
||||||
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
|
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
|
||||||
|
|
||||||
./main 2>>"$LOG" "${OPTS[@]}" \
|
./llama-cli 2>>"$LOG" "${OPTS[@]}" \
|
||||||
--prompt-cache "$CUR_PROMPT_CACHE" \
|
--prompt-cache "$CUR_PROMPT_CACHE" \
|
||||||
--prompt-cache-all \
|
--prompt-cache-all \
|
||||||
--file "$CUR_PROMPT_FILE" \
|
--file "$CUR_PROMPT_FILE" \
|
||||||
--reverse-prompt "${USER_NAME}:" \
|
--reverse-prompt "${USER_NAME}:" \
|
||||||
--n_predict "$n_predict" |
|
--n_predict "$n_predict" |
|
||||||
skip_bytes 1 | # skip BOS token added by ./main
|
skip_bytes 1 | # skip BOS token added by ./llama-cli
|
||||||
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
|
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
|
||||||
skip_bytes "$n_prompt_len_pre" # print generation
|
skip_bytes "$n_prompt_len_pre" # print generation
|
||||||
|
|
||||||
|
@ -133,7 +133,7 @@ while read -e line; do
|
||||||
# TODO get both messages in one go
|
# TODO get both messages in one go
|
||||||
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
|
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
|
||||||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
||||||
echo >&2 "Couldn't get number of tokens from ./main output!"
|
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -144,7 +144,7 @@ while read -e line; do
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Update cache for next prompt in background, ideally during user input
|
# Update cache for next prompt in background, ideally during user input
|
||||||
./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
|
./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
|
||||||
--prompt-cache "$NEXT_PROMPT_CACHE" \
|
--prompt-cache "$NEXT_PROMPT_CACHE" \
|
||||||
--file "$NEXT_PROMPT_FILE" \
|
--file "$NEXT_PROMPT_FILE" \
|
||||||
--n_predict 1 &
|
--n_predict 1 &
|
||||||
|
|
|
@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
|
||||||
$PROMPT_TEMPLATE > $PROMPT_FILE
|
$PROMPT_TEMPLATE > $PROMPT_FILE
|
||||||
|
|
||||||
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
||||||
./bin/main $GEN_OPTIONS \
|
./bin/llama-cli $GEN_OPTIONS \
|
||||||
--model "$MODEL" \
|
--model "$MODEL" \
|
||||||
--threads "$N_THREAD" \
|
--threads "$N_THREAD" \
|
||||||
--n_predict "$N_PREDICTS" \
|
--n_predict "$N_PREDICTS" \
|
||||||
|
|
|
@ -11,6 +11,6 @@ cd ..
|
||||||
#
|
#
|
||||||
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
|
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
|
||||||
#
|
#
|
||||||
./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
|
./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
|
||||||
--repeat_penalty 1.0 --color -i \
|
--repeat_penalty 1.0 --color -i \
|
||||||
-r "User:" -f prompts/chat-with-bob.txt
|
-r "User:" -f prompts/chat-with-bob.txt
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET convert-llama2c-to-ggml)
|
set(TARGET llama-convert-llama2c-to-ggml)
|
||||||
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
|
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -8,7 +8,7 @@ To convert the model first download the models from the [llama2.c](https://githu
|
||||||
|
|
||||||
After successful compilation, following usage options are available:
|
After successful compilation, following usage options are available:
|
||||||
```
|
```
|
||||||
usage: ./convert-llama2c-to-ggml [options]
|
usage: ./llama-convert-llama2c-to-ggml [options]
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
|
@ -19,10 +19,10 @@ options:
|
||||||
|
|
||||||
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
|
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
|
||||||
|
|
||||||
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
|
`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
|
||||||
|
|
||||||
Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
|
Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
|
||||||
|
|
||||||
Now you can use the model with a command like:
|
Now you can use the model with a command like:
|
||||||
|
|
||||||
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
|
`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
|
||||||
|
|
5
examples/cvector-generator/CMakeLists.txt
Normal file
5
examples/cvector-generator/CMakeLists.txt
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
set(TARGET llama-cvector-generator)
|
||||||
|
add_executable(${TARGET} cvector-generator.cpp pca.hpp)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
34
examples/cvector-generator/README.md
Normal file
34
examples/cvector-generator/README.md
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
# cvector-generator
|
||||||
|
|
||||||
|
This example demonstrates how to generate a control vector using gguf models.
|
||||||
|
|
||||||
|
Related PRs:
|
||||||
|
- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
|
||||||
|
- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
|
||||||
|
- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# CPU only
|
||||||
|
./llama-cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
|
||||||
|
|
||||||
|
# With GPU
|
||||||
|
./llama-cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
|
||||||
|
|
||||||
|
# With advanced options
|
||||||
|
./llama-cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100
|
||||||
|
|
||||||
|
# To see help message
|
||||||
|
./llama-cvector-generator -h
|
||||||
|
# Then, have a look at "cvector" section
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tips and tricks
|
||||||
|
|
||||||
|
If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example:
|
||||||
|
|
||||||
|
```
|
||||||
|
<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
|
||||||
|
<|im_start|>system\nYou are in a very good mood today<|im_end|>
|
||||||
|
```
|
582
examples/cvector-generator/completions.txt
Normal file
582
examples/cvector-generator/completions.txt
Normal file
|
@ -0,0 +1,582 @@
|
||||||
|
|
||||||
|
That game
|
||||||
|
I can see
|
||||||
|
Hmm, this
|
||||||
|
I can relate to
|
||||||
|
Who is
|
||||||
|
I understand the
|
||||||
|
Ugh,
|
||||||
|
What the hell was
|
||||||
|
Hey, did anyone
|
||||||
|
Although
|
||||||
|
Thank you for choosing
|
||||||
|
What are you
|
||||||
|
Oh w
|
||||||
|
How dare you open
|
||||||
|
It was my pleasure
|
||||||
|
I'm hon
|
||||||
|
I appreciate that you
|
||||||
|
Are you k
|
||||||
|
Whoever left this
|
||||||
|
It's always
|
||||||
|
Ew,
|
||||||
|
Hey, I l
|
||||||
|
Hello? Is someone
|
||||||
|
I understand that
|
||||||
|
That poem
|
||||||
|
Aww, poor
|
||||||
|
Hey, it
|
||||||
|
Alright, who
|
||||||
|
I didn't
|
||||||
|
Well, life
|
||||||
|
The document
|
||||||
|
Oh no, this
|
||||||
|
I'm concerned
|
||||||
|
Hello, this is
|
||||||
|
This art
|
||||||
|
Hmm, this drink
|
||||||
|
Hi there!
|
||||||
|
It seems
|
||||||
|
Is
|
||||||
|
Good
|
||||||
|
I can't
|
||||||
|
Ex
|
||||||
|
Who are
|
||||||
|
I can see that
|
||||||
|
Wow,
|
||||||
|
Today is a
|
||||||
|
Hey friend
|
||||||
|
Sometimes friends
|
||||||
|
Oh, this old
|
||||||
|
The weather outside
|
||||||
|
This place is sur
|
||||||
|
I appreciate your input
|
||||||
|
Thank you for the
|
||||||
|
Look at
|
||||||
|
I'm disappoint
|
||||||
|
To my
|
||||||
|
How dare you
|
||||||
|
That's an
|
||||||
|
This piece of art
|
||||||
|
Eww
|
||||||
|
This park is
|
||||||
|
This is incredible
|
||||||
|
Oh no, someone
|
||||||
|
Exc
|
||||||
|
Well, it'
|
||||||
|
I warned
|
||||||
|
Hey, I understand
|
||||||
|
Hey, I saw
|
||||||
|
How dare you go
|
||||||
|
What the he
|
||||||
|
Hey
|
||||||
|
It's
|
||||||
|
Hello? Hello?
|
||||||
|
It
|
||||||
|
Oh no!
|
||||||
|
This is the perfect
|
||||||
|
Good morning,
|
||||||
|
Oh no, there
|
||||||
|
It's so
|
||||||
|
Yeah
|
||||||
|
Uh,
|
||||||
|
Hello everyone
|
||||||
|
Who turned off
|
||||||
|
The weather
|
||||||
|
Who'
|
||||||
|
Hey, this
|
||||||
|
Wait,
|
||||||
|
Eww, gross
|
||||||
|
Excuse
|
||||||
|
It seems like you
|
||||||
|
Thank you so
|
||||||
|
What happened?
|
||||||
|
Oh my g
|
||||||
|
I am deeply sad
|
||||||
|
I war
|
||||||
|
Okay, let'
|
||||||
|
Hey, that
|
||||||
|
That was a beautiful
|
||||||
|
Oh no! That
|
||||||
|
What happened
|
||||||
|
Hey there
|
||||||
|
The artist'
|
||||||
|
What?!
|
||||||
|
Hey, it'
|
||||||
|
I am disappoint
|
||||||
|
It seems like
|
||||||
|
Oh no! The
|
||||||
|
This park is a
|
||||||
|
If you
|
||||||
|
Yes! I did
|
||||||
|
It sounds
|
||||||
|
What
|
||||||
|
Who is it
|
||||||
|
Hmm, that
|
||||||
|
That's strange
|
||||||
|
Yeah, that was
|
||||||
|
That's interesting
|
||||||
|
This park
|
||||||
|
What the hell
|
||||||
|
Who is that
|
||||||
|
I feel like my
|
||||||
|
Oh well
|
||||||
|
What the hell is
|
||||||
|
Hello? Hello
|
||||||
|
To my dearest
|
||||||
|
Bless you!\"
|
||||||
|
Thank you for
|
||||||
|
Oh, looks like
|
||||||
|
Can you please
|
||||||
|
This place is
|
||||||
|
Eww, what
|
||||||
|
Bless you
|
||||||
|
Is everything
|
||||||
|
Hey, I just
|
||||||
|
Whoever left these
|
||||||
|
Well, that'
|
||||||
|
I feel
|
||||||
|
Hey, do you
|
||||||
|
It's sad
|
||||||
|
Oh no, it
|
||||||
|
Hey, that'
|
||||||
|
Oh my god,
|
||||||
|
Thank you,
|
||||||
|
Hello little one,
|
||||||
|
I apolog
|
||||||
|
Hey team, I
|
||||||
|
How dare you read
|
||||||
|
Who is this and
|
||||||
|
Whoever left
|
||||||
|
Hi there! W
|
||||||
|
A
|
||||||
|
If you have
|
||||||
|
I was
|
||||||
|
U
|
||||||
|
Bless
|
||||||
|
Well, this
|
||||||
|
Oh, I'
|
||||||
|
It's a
|
||||||
|
Eww,
|
||||||
|
Is everything okay?
|
||||||
|
Oh, I
|
||||||
|
Hello, can you
|
||||||
|
Al
|
||||||
|
That was a great
|
||||||
|
What are
|
||||||
|
I understand that not
|
||||||
|
Oh no, not
|
||||||
|
Who is it?\"
|
||||||
|
Hey, can we
|
||||||
|
Whoever is taking
|
||||||
|
I would love to
|
||||||
|
Hey, I noticed
|
||||||
|
Hey, could
|
||||||
|
I understand that there
|
||||||
|
Hello?
|
||||||
|
D
|
||||||
|
Oh man, I
|
||||||
|
Thank you so much
|
||||||
|
Oh no, my
|
||||||
|
Dear [Name
|
||||||
|
Uh
|
||||||
|
I remember
|
||||||
|
Hey, who
|
||||||
|
Well, it
|
||||||
|
Are you
|
||||||
|
I understand that it
|
||||||
|
Hey, is
|
||||||
|
I would
|
||||||
|
Who is this
|
||||||
|
Excuse me
|
||||||
|
Alright
|
||||||
|
I am thrilled
|
||||||
|
Sometimes friends have
|
||||||
|
Who the
|
||||||
|
It's interesting
|
||||||
|
I would love
|
||||||
|
E
|
||||||
|
Hello? Is anyone
|
||||||
|
Well, this is
|
||||||
|
This place
|
||||||
|
Well,
|
||||||
|
I warned you
|
||||||
|
Hey, watch where
|
||||||
|
Oh my
|
||||||
|
That'
|
||||||
|
Sometimes friends have different
|
||||||
|
I understand that everyone
|
||||||
|
What?
|
||||||
|
What do these notes
|
||||||
|
I can relate
|
||||||
|
I'm not
|
||||||
|
I understand
|
||||||
|
To my dear
|
||||||
|
Guys
|
||||||
|
Well
|
||||||
|
Hey, I appreciate
|
||||||
|
Wow, what
|
||||||
|
Dear
|
||||||
|
That melody
|
||||||
|
Who the hell
|
||||||
|
Today is
|
||||||
|
Hello little
|
||||||
|
Wow, look
|
||||||
|
That's great
|
||||||
|
Love is never wrong
|
||||||
|
I'm having
|
||||||
|
Whoa, did
|
||||||
|
Ugh
|
||||||
|
Can you please provide
|
||||||
|
I miss you,
|
||||||
|
I feel uncom
|
||||||
|
I know
|
||||||
|
Ugh, this
|
||||||
|
Hey, watch
|
||||||
|
Oh great, a
|
||||||
|
I didn
|
||||||
|
Okay
|
||||||
|
That game of char
|
||||||
|
Oh
|
||||||
|
I appreciate
|
||||||
|
Who's there
|
||||||
|
I am so
|
||||||
|
Oh great, someone
|
||||||
|
Hey, could you
|
||||||
|
I remember wondering
|
||||||
|
Wait, what?
|
||||||
|
What do
|
||||||
|
Hello? Can
|
||||||
|
Hey there,
|
||||||
|
That game of
|
||||||
|
This is incred
|
||||||
|
Oh my gosh
|
||||||
|
Oh great, f
|
||||||
|
I appreciate your
|
||||||
|
It sounds like
|
||||||
|
What the heck
|
||||||
|
Okay, I understand
|
||||||
|
Ew
|
||||||
|
I understand that this
|
||||||
|
Uh, hi
|
||||||
|
Hi everyone!
|
||||||
|
What the hell?
|
||||||
|
Thank you for your
|
||||||
|
Oh no, the
|
||||||
|
Wow, I
|
||||||
|
Who turned
|
||||||
|
Dear [
|
||||||
|
Whoever
|
||||||
|
This is a
|
||||||
|
Whoa, he
|
||||||
|
What in the world
|
||||||
|
Although the physical
|
||||||
|
Hello, who is
|
||||||
|
That's amaz
|
||||||
|
Hey, I know
|
||||||
|
Okay, that
|
||||||
|
Hi everyone
|
||||||
|
Hey, is everything
|
||||||
|
I understand your fr
|
||||||
|
Oh no, poor
|
||||||
|
Oh, look
|
||||||
|
Good morning
|
||||||
|
Ew, gross
|
||||||
|
Oh no, did
|
||||||
|
Look at the family
|
||||||
|
Hey team
|
||||||
|
Yes!
|
||||||
|
Hey, can I
|
||||||
|
Okay, that'
|
||||||
|
It's great
|
||||||
|
Love is
|
||||||
|
Hey, what
|
||||||
|
Good morning, world
|
||||||
|
Who is it?
|
||||||
|
That poem really reson
|
||||||
|
I
|
||||||
|
That's
|
||||||
|
I understand the task
|
||||||
|
Gu
|
||||||
|
Hello? Who'
|
||||||
|
This postcard is
|
||||||
|
Whoa,
|
||||||
|
Oh, that
|
||||||
|
I understand that I
|
||||||
|
Whoever is
|
||||||
|
Hello? Who is
|
||||||
|
I'm really
|
||||||
|
Wow, this
|
||||||
|
Can
|
||||||
|
This artwork really
|
||||||
|
This is a shame
|
||||||
|
I miss you too
|
||||||
|
Who are you?
|
||||||
|
Today is a difficult
|
||||||
|
Hey, just
|
||||||
|
Are you okay
|
||||||
|
I am
|
||||||
|
Hi,
|
||||||
|
Wow, that
|
||||||
|
Hey there! Can
|
||||||
|
Okay, stay
|
||||||
|
Oh great, just
|
||||||
|
Yeah,
|
||||||
|
Hello? Can you
|
||||||
|
Oh, looks
|
||||||
|
Thank you for sharing
|
||||||
|
I'm glad
|
||||||
|
Hey, is that
|
||||||
|
Hmm
|
||||||
|
It was my
|
||||||
|
It sounds like you
|
||||||
|
Wow, your
|
||||||
|
I was promised certain
|
||||||
|
That was such a
|
||||||
|
Thank
|
||||||
|
Excuse you
|
||||||
|
That was
|
||||||
|
Hey team,
|
||||||
|
I feel un
|
||||||
|
It was
|
||||||
|
What'
|
||||||
|
Hey friend, I
|
||||||
|
How
|
||||||
|
Saying goodbye
|
||||||
|
That
|
||||||
|
It's heart
|
||||||
|
How dare
|
||||||
|
Oh,
|
||||||
|
Hello, may
|
||||||
|
What's this
|
||||||
|
Thank you for recogn
|
||||||
|
Aww, that
|
||||||
|
Oh, I remember
|
||||||
|
Hmm, that'
|
||||||
|
I miss
|
||||||
|
I know this
|
||||||
|
Wait
|
||||||
|
Is everything okay
|
||||||
|
Who is that person
|
||||||
|
Wow, you
|
||||||
|
Oh great
|
||||||
|
I'm sad
|
||||||
|
Wow, the
|
||||||
|
I am very disappoint
|
||||||
|
Who turned off the
|
||||||
|
I understand that things
|
||||||
|
I'm very
|
||||||
|
Hi
|
||||||
|
That's very
|
||||||
|
Okay, I
|
||||||
|
Oh no,
|
||||||
|
Wow, there
|
||||||
|
What's wrong
|
||||||
|
I apologize for
|
||||||
|
Hey, I
|
||||||
|
Can I help you
|
||||||
|
Oh, I didn
|
||||||
|
Alright,
|
||||||
|
Oh wow,
|
||||||
|
Oh my goodness
|
||||||
|
I know this event
|
||||||
|
What in the
|
||||||
|
Saying
|
||||||
|
Yeah, that
|
||||||
|
Guys, I
|
||||||
|
Hey, this v
|
||||||
|
This post
|
||||||
|
Are
|
||||||
|
Hey, can
|
||||||
|
Hello? Is
|
||||||
|
I can only imagine
|
||||||
|
Oh, that sounds
|
||||||
|
Hey, is anyone
|
||||||
|
I am disappointed
|
||||||
|
Hello,
|
||||||
|
Hey everyone, I
|
||||||
|
That was such
|
||||||
|
It's okay
|
||||||
|
The artist
|
||||||
|
Whoa
|
||||||
|
I understand that mistakes
|
||||||
|
Can I help
|
||||||
|
Who
|
||||||
|
Hi everyone! I
|
||||||
|
Hey, can you
|
||||||
|
Wow, how
|
||||||
|
Today
|
||||||
|
Oh no, I
|
||||||
|
Oh well, I
|
||||||
|
Well, that
|
||||||
|
This is the
|
||||||
|
Yes! I finally
|
||||||
|
Hey there little
|
||||||
|
Hello everyone!
|
||||||
|
Love is never
|
||||||
|
Look at the
|
||||||
|
This postcard
|
||||||
|
Oh great,
|
||||||
|
Can I
|
||||||
|
Hmm, this is
|
||||||
|
I understand your
|
||||||
|
Oh, look at
|
||||||
|
B
|
||||||
|
I'm so
|
||||||
|
Whoa, this
|
||||||
|
W
|
||||||
|
Oh, this
|
||||||
|
Sometimes
|
||||||
|
This piece of
|
||||||
|
What the
|
||||||
|
That was a
|
||||||
|
Hey, do
|
||||||
|
Oh no
|
||||||
|
Whoa, what
|
||||||
|
I feel like I
|
||||||
|
The documentary
|
||||||
|
Hello
|
||||||
|
Hello little one
|
||||||
|
I understand that my
|
||||||
|
Eww, that
|
||||||
|
Wow, an
|
||||||
|
Yes! Finally,
|
||||||
|
Although the physical location
|
||||||
|
Whoever is watching
|
||||||
|
That movie
|
||||||
|
I remember wondering about
|
||||||
|
Hey there, little
|
||||||
|
Who's
|
||||||
|
Hello, who
|
||||||
|
Hello everyone! Thank
|
||||||
|
Hello, can
|
||||||
|
That's too
|
||||||
|
Hey, just wanted
|
||||||
|
Hey there, I
|
||||||
|
Saying good
|
||||||
|
Hey there!
|
||||||
|
Who is there?
|
||||||
|
Oh my good
|
||||||
|
I am very
|
||||||
|
Oh no, what
|
||||||
|
Wow, thank
|
||||||
|
I was promised
|
||||||
|
Hi, is
|
||||||
|
Hey, I'
|
||||||
|
Guys, the
|
||||||
|
Oh no, that
|
||||||
|
Who is there
|
||||||
|
Hello, this
|
||||||
|
That movie really touched
|
||||||
|
If you have something
|
||||||
|
The documentary was
|
||||||
|
I'm starting
|
||||||
|
Are you kidd
|
||||||
|
That movie really
|
||||||
|
Hey everyone,
|
||||||
|
Thank you for considering
|
||||||
|
I didn'
|
||||||
|
Yes! I
|
||||||
|
Can you
|
||||||
|
Oh my god
|
||||||
|
Hey, whoever
|
||||||
|
That melody really
|
||||||
|
Thank you, little
|
||||||
|
Hello, may I
|
||||||
|
Look
|
||||||
|
Wow, we
|
||||||
|
It looks
|
||||||
|
What do these
|
||||||
|
Oh wow
|
||||||
|
I apologize
|
||||||
|
What are you all
|
||||||
|
It's such
|
||||||
|
It's clear
|
||||||
|
Hey, I was
|
||||||
|
Hey friend,
|
||||||
|
I can only
|
||||||
|
The weather outside is
|
||||||
|
Eww, this
|
||||||
|
I miss you
|
||||||
|
Wow
|
||||||
|
Aww,
|
||||||
|
Hi, is there
|
||||||
|
This artwork
|
||||||
|
Okay,
|
||||||
|
Oh well,
|
||||||
|
This
|
||||||
|
I'
|
||||||
|
Say
|
||||||
|
Hey there little gu
|
||||||
|
Hmm,
|
||||||
|
Whoa, who
|
||||||
|
I am thr
|
||||||
|
Oh man
|
||||||
|
Okay, stay calm
|
||||||
|
I'm happy
|
||||||
|
Oh, this cur
|
||||||
|
Oh man,
|
||||||
|
I'm sorry
|
||||||
|
Hello? Who
|
||||||
|
What?! That
|
||||||
|
This piece
|
||||||
|
Hey everyone
|
||||||
|
That's so
|
||||||
|
Are you okay?
|
||||||
|
What happened? Where
|
||||||
|
Hi there
|
||||||
|
The
|
||||||
|
Who the hell entered
|
||||||
|
I can
|
||||||
|
Guys,
|
||||||
|
What's
|
||||||
|
What in
|
||||||
|
It's important
|
||||||
|
I'm
|
||||||
|
I'm coming
|
||||||
|
It'
|
||||||
|
Yes! Finally
|
||||||
|
Wait, what
|
||||||
|
Wow, reading
|
||||||
|
I'm surprised
|
||||||
|
Hey, did
|
||||||
|
Hey,
|
||||||
|
Okay, let
|
||||||
|
I understand that you
|
||||||
|
Who the hell threw
|
||||||
|
Eww, who
|
||||||
|
Thank you for thinking
|
||||||
|
Who is this?\"
|
||||||
|
I am deeply
|
||||||
|
Thank you for including
|
||||||
|
Oh no, an
|
||||||
|
It looks like you
|
||||||
|
Aww
|
||||||
|
I'm confused
|
||||||
|
Wow, it
|
||||||
|
That poem really
|
||||||
|
Yes
|
||||||
|
Hey there, is
|
||||||
|
Hey, what'
|
||||||
|
Thank you for remember
|
||||||
|
To
|
||||||
|
This is
|
||||||
|
Thank you for making
|
||||||
|
I can'
|
||||||
|
That mel
|
||||||
|
Wow, they
|
||||||
|
I feel like
|
||||||
|
Although the
|
||||||
|
Who are you
|
||||||
|
Love
|
||||||
|
If
|
||||||
|
What the hell are
|
||||||
|
I am so sad
|
||||||
|
Oh, I found
|
||||||
|
Thank you
|
||||||
|
It looks like
|
||||||
|
Well, life is
|
||||||
|
I appreciate that
|
||||||
|
The artist's
|
||||||
|
Whoa, that
|
||||||
|
It's never
|
499
examples/cvector-generator/cvector-generator.cpp
Normal file
499
examples/cvector-generator/cvector-generator.cpp
Normal file
|
@ -0,0 +1,499 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "pca.hpp"
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include "ggml-metal.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <climits>
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// utils
|
||||||
|
|
||||||
|
template <class Iter>
|
||||||
|
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
|
||||||
|
std::string ret;
|
||||||
|
for (; begin != end; ++begin) {
|
||||||
|
ret += llama_token_to_piece(ctx, *begin);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print the common parameter help followed by a few example invocations of this tool.
// The program name shown in the examples is taken from argv[0].
static void print_usage(int argc, char ** argv, const gpt_params & params) {
    gpt_params_print_usage(argc, argv, params);

    const char * prog = argv[0];

    printf("\nexample usage:\n");
    printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", prog);
    printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", prog);
    printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", prog);
    printf("\n");
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
|
// cb_eval is reused for each pair of positive - negative prompt
|
||||||
|
struct callback_data {
|
||||||
|
ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered
|
||||||
|
|
||||||
|
int n_layers = 0;
|
||||||
|
int n_tokens = 0;
|
||||||
|
bool is_eval_pos = true;
|
||||||
|
|
||||||
|
// each element of the vector correspond to one layer
|
||||||
|
std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
|
||||||
|
std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
|
||||||
|
std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer
|
||||||
|
|
||||||
|
// save a tensor into either v_pos or v_neg (decided by is_eval_pos)
|
||||||
|
void save_tensor_for_layer(struct ggml_tensor * t) {
|
||||||
|
GGML_ASSERT(t->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
if (ctx_ggml == nullptr) {
|
||||||
|
// alloc a new ctx_ggml if needed
|
||||||
|
struct ggml_init_params params_ggml = {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u,
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ctx_ggml = ggml_init(params_ggml);
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy tensor data
|
||||||
|
auto n_bytes = ggml_nbytes(t);
|
||||||
|
struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
|
||||||
|
t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
|
||||||
|
ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
|
||||||
|
ggml_set_name(t_layer, ggml_get_name(t));
|
||||||
|
//print_debug_tensor(t_layer);
|
||||||
|
|
||||||
|
if (is_eval_pos) {
|
||||||
|
v_pos.push_back(t_layer);
|
||||||
|
} else {
|
||||||
|
v_neg.push_back(t_layer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// calculate diff (v_pos - v_neg) and place the result back to v_pos
|
||||||
|
// all zero rows in the diff tensor will also be removed
|
||||||
|
// NOTE: final layer is ignored. we only have (n_layers - 1) to process
|
||||||
|
std::vector<struct ggml_tensor *> calc_diff() {
|
||||||
|
for (float il = 0; il < v_pos.size(); il++) {
|
||||||
|
float * a = (float *) v_pos[il]->data;
|
||||||
|
float * b = (float *) v_neg[il]->data;
|
||||||
|
size_t n_elem = ggml_nelements(v_pos[il]);
|
||||||
|
for (size_t j = 0; j < n_elem; j++) {
|
||||||
|
a[j] -= b[j];
|
||||||
|
}
|
||||||
|
//print_debug_tensor(v_pos[i]);
|
||||||
|
auto diff_filtered = filter_nonzero_rows(v_pos[il]);
|
||||||
|
v_diff_filtered.push_back(diff_filtered);
|
||||||
|
}
|
||||||
|
return v_diff_filtered; // for convinient, we return the result std::vector
|
||||||
|
}
|
||||||
|
|
||||||
|
// delete zero rows from a given 2D tensor
|
||||||
|
struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
|
||||||
|
//printf("filter_nonzero_rows\n");
|
||||||
|
auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
|
||||||
|
// check if given row containing all zero elements
|
||||||
|
int n_cols = t->ne[0]; // hint: should be equal to n_embd
|
||||||
|
for (int col = 0; col < n_cols; ++col) {
|
||||||
|
if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
std::vector<int> rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered)
|
||||||
|
for (int i_row = 0; i_row < a->ne[1]; i_row++) {
|
||||||
|
if (!is_row_all_zeros(a, i_row, 1e-6)) {
|
||||||
|
rows_to_copy.push_back(i_row);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// get "n_nonzero_rows" for the output "diff_filtered"
|
||||||
|
int n_nonzero_rows = rows_to_copy.size();
|
||||||
|
//printf("n_nonzero_rows: %d\n", n_nonzero_rows);
|
||||||
|
int n_embd = a->ne[0];
|
||||||
|
GGML_ASSERT(n_nonzero_rows > 0);
|
||||||
|
|
||||||
|
// diff_filtered: [n_embd, n_nonzero_rows]
|
||||||
|
struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
|
||||||
|
ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
|
||||||
|
ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
|
||||||
|
diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
|
||||||
|
|
||||||
|
// copy non-zero rows
|
||||||
|
for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
|
||||||
|
int src_row = rows_to_copy[dest_row];
|
||||||
|
for (int i = 0; i < n_embd; i++) {
|
||||||
|
float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
|
||||||
|
ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//print_debug_tensor(diff_filtered);
|
||||||
|
|
||||||
|
return diff_filtered;
|
||||||
|
}
|
||||||
|
|
||||||
|
// we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors
|
||||||
|
void reset() {
|
||||||
|
for (auto ptr : v_pos) free(ptr->data);
|
||||||
|
for (auto ptr : v_neg) free(ptr->data);
|
||||||
|
for (auto ptr : v_diff_filtered) free(ptr->data);
|
||||||
|
v_pos.clear();
|
||||||
|
v_neg.clear();
|
||||||
|
v_diff_filtered.clear();
|
||||||
|
if (ctx_ggml) {
|
||||||
|
ggml_free(ctx_ggml);
|
||||||
|
}
|
||||||
|
ctx_ggml = nullptr;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* process_ctx is used to store the ggml context for pre-post processing the diff vectors
|
||||||
|
* in short, input => v_diff and output => v_final
|
||||||
|
*/
|
||||||
|
struct train_context {
|
||||||
|
ggml_context * ctx_ggml;
|
||||||
|
int n_embd;
|
||||||
|
int n_layers;
|
||||||
|
|
||||||
|
/* pair of prompts to be used for generating final vector */
|
||||||
|
std::vector<std::string> positive_entries;
|
||||||
|
std::vector<std::string> negative_entries;
|
||||||
|
|
||||||
|
// each element of the vector correspond to one layer
|
||||||
|
// NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here
|
||||||
|
// NOTE (2): v_diff is transposed from v_diff_tmp
|
||||||
|
std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
|
||||||
|
std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
|
||||||
|
|
||||||
|
// to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor
|
||||||
|
// v_diff_tmp will get converted unto v_diff later on
|
||||||
|
std::vector<std::vector<uint8_t>> v_diff_tmp;
|
||||||
|
|
||||||
|
train_context(int n_embd_, int n_layers_) {
|
||||||
|
n_embd = n_embd_;
|
||||||
|
n_layers = n_layers_;
|
||||||
|
struct ggml_init_params params_ggml = {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ctx_ggml = ggml_init(params_ggml);
|
||||||
|
for (int il = 0; il < n_layers - 1; il++) {
|
||||||
|
std::vector<uint8_t> empty;
|
||||||
|
v_diff_tmp.push_back(empty);
|
||||||
|
auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
|
||||||
|
t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
|
||||||
|
v_final.push_back(t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// add new rows into existing tensor in v_diff_tmp
|
||||||
|
void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
|
||||||
|
GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
|
||||||
|
for (int il = 0; il < n_layers - 1; il++) {
|
||||||
|
auto t = diff_filtered[il];
|
||||||
|
auto & diff_tmp = v_diff_tmp[il];
|
||||||
|
size_t curr_size = diff_tmp.size();
|
||||||
|
diff_tmp.resize(curr_size + ggml_nbytes(t));
|
||||||
|
memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
|
||||||
|
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
|
||||||
|
void build_v_diff() {
|
||||||
|
printf("build_v_diff\n");
|
||||||
|
for (int il = 0; il < n_layers - 1; il++) {
|
||||||
|
auto & diff_tmp = v_diff_tmp[il];
|
||||||
|
int n_elem = diff_tmp.size() / sizeof(float);
|
||||||
|
GGML_ASSERT(n_elem % n_embd == 0);
|
||||||
|
int n_rows = n_elem / n_embd;
|
||||||
|
struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
|
||||||
|
ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
|
||||||
|
// copy data & transpose
|
||||||
|
diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
|
||||||
|
float * arr = (float *) diff_tmp.data();
|
||||||
|
for (int ir = 0; ir < n_rows; ++ir) {
|
||||||
|
for (int ic = 0; ic < n_embd; ++ic) {
|
||||||
|
float f = arr[ir*n_embd + ic];
|
||||||
|
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
v_diff.push_back(diff);
|
||||||
|
print_debug_tensor(diff);
|
||||||
|
// free memory of diff_tmp
|
||||||
|
diff_tmp.resize(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
~train_context() {
|
||||||
|
for (auto ptr : v_final) free(ptr->data);
|
||||||
|
for (auto ptr : v_diff) free(ptr->data);
|
||||||
|
// no need to free v_diff_tmp, since we didn't use malloc
|
||||||
|
ggml_free(ctx_ggml);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct tokenized_prompt {
|
||||||
|
std::vector<llama_token> tokens_pos;
|
||||||
|
std::vector<llama_token> tokens_neg;
|
||||||
|
size_t max_seq_len;
|
||||||
|
|
||||||
|
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
|
||||||
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
|
||||||
|
tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
|
||||||
|
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
|
||||||
|
padding_seq(ctx, tokens_pos, max_seq_len);
|
||||||
|
padding_seq(ctx, tokens_neg, max_seq_len);
|
||||||
|
}
|
||||||
|
|
||||||
|
void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
|
||||||
|
// TODO: customize padding token
|
||||||
|
std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
|
||||||
|
llama_token pad_tok = pad_tokens.back();
|
||||||
|
while (tokens.size() < len) {
|
||||||
|
tokens.push_back(pad_tok);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// Format any value that supports operator<< on a stream as a std::string.
template <typename T>
static std::string to_string(const T & val) {
    std::ostringstream out;
    out << val;
    return out.str();
}
|
||||||
|
|
||||||
|
static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
|
||||||
|
std::vector<std::string> output;
|
||||||
|
std::ifstream file(path);
|
||||||
|
if (!file.is_open()) {
|
||||||
|
fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
std::string line;
|
||||||
|
while (std::getline(file, line)) {
|
||||||
|
bool is_skip = skip_empty_lines && line.empty();
|
||||||
|
if (!is_skip) {
|
||||||
|
string_process_escapes(line);
|
||||||
|
output.push_back(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
file.close();
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
|
||||||
|
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
|
auto * cb_data = (callback_data *) user_data;
|
||||||
|
static const char * l_out_name = "l_out";
|
||||||
|
const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
|
||||||
|
|
||||||
|
if (ask) {
|
||||||
|
return is_l_out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// save the tensor to current context
|
||||||
|
cb_data->save_tensor_for_layer(t);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clear the KV cache and decode `tokens` as a single batch.
// Returns false (after printing a message to stderr) when llama_decode() reports a non-zero status.
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_kv_cache_clear(ctx);

    const auto status = llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0));
    if (status != 0) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }

    return true;
}
|
||||||
|
|
||||||
|
static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
|
||||||
|
struct gguf_context * ctx = gguf_init_empty();
|
||||||
|
|
||||||
|
const std::string arch = "controlvector";
|
||||||
|
gguf_set_val_str(ctx, "general.architecture", arch.c_str());
|
||||||
|
gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
|
||||||
|
gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
|
||||||
|
|
||||||
|
for (size_t i = 0; i < v_ctrl.size(); ++i) {
|
||||||
|
gguf_add_tensor(ctx, v_ctrl[i]);
|
||||||
|
print_debug_tensor(v_ctrl[i]);
|
||||||
|
printf("Added tensor: %s\n", v_ctrl[i]->name);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: writing file...\n", __func__);
|
||||||
|
gguf_write_to_file(ctx, fname.c_str(), false);
|
||||||
|
printf("%s: wrote file '%s'\n", __func__, fname.c_str());
|
||||||
|
gguf_free(ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load prompt files and completion file.
|
||||||
|
* Then format each pair of prompt + completion to make an entry.
|
||||||
|
*/
|
||||||
|
static int prepare_entries(gpt_params & params, train_context & ctx_train) {
|
||||||
|
// load prompts
|
||||||
|
std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
|
||||||
|
std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
|
||||||
|
if (positive_prompts.size() != negative_prompts.size()) {
|
||||||
|
fprintf(stderr, "number of positive and negative prompts must be equal\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (positive_prompts.empty()) {
|
||||||
|
fprintf(stderr, "must provide at least one prompt pair\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// create templated prompts
|
||||||
|
std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
|
||||||
|
auto format_template = [](std::string persona, std::string suffix) {
|
||||||
|
// entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
|
||||||
|
return persona + " " + suffix;
|
||||||
|
};
|
||||||
|
for (size_t i = 0; i < positive_prompts.size(); ++i) {
|
||||||
|
for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
|
||||||
|
// TODO replicate the truncations done by the python implementation
|
||||||
|
ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
|
||||||
|
ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
gpt_params params;
|
||||||
|
|
||||||
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
print_usage(argc, argv, params);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.n_pca_iterations % params.n_pca_batch != 0) {
|
||||||
|
fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
callback_data cb_data;
|
||||||
|
|
||||||
|
// pass the callback to the backend scheduler
|
||||||
|
// it will be executed for each node during the graph computation
|
||||||
|
params.cb_eval = cb_eval;
|
||||||
|
params.cb_eval_user_data = &cb_data;
|
||||||
|
params.warmup = false;
|
||||||
|
|
||||||
|
print_build_info();
|
||||||
|
llama_backend_init();
|
||||||
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
|
// load the model to get hparams
|
||||||
|
llama_model * model;
|
||||||
|
llama_context * ctx;
|
||||||
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
|
// int n_ctx = llama_n_ctx(ctx);
|
||||||
|
int n_layers = llama_n_layer(model);
|
||||||
|
int n_embd = llama_n_embd(model);
|
||||||
|
// get model hint param (a.k.a model arch name)
|
||||||
|
char model_hint[128];
|
||||||
|
llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
|
||||||
|
|
||||||
|
// init train_context
|
||||||
|
train_context ctx_train(n_embd, n_layers);
|
||||||
|
|
||||||
|
// load and prepare entries for training
|
||||||
|
prepare_entries(params, ctx_train);
|
||||||
|
|
||||||
|
// we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped
|
||||||
|
std::vector<tokenized_prompt> tokenized_prompts;
|
||||||
|
size_t n_total_tokens = 0;
|
||||||
|
for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
|
||||||
|
tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
|
||||||
|
n_total_tokens += 2 * t.max_seq_len;
|
||||||
|
tokenized_prompts.push_back(std::move(t));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
|
||||||
|
|
||||||
|
for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
|
||||||
|
bool success = false;
|
||||||
|
tokenized_prompt t = tokenized_prompts[i];
|
||||||
|
cb_data.n_layers = n_layers;
|
||||||
|
cb_data.n_tokens = t.max_seq_len;
|
||||||
|
|
||||||
|
printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
|
||||||
|
(int) i+1, (int) ctx_train.positive_entries.size(),
|
||||||
|
tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
|
||||||
|
tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
|
||||||
|
(int) t.max_seq_len);
|
||||||
|
|
||||||
|
cb_data.is_eval_pos = true;
|
||||||
|
success = get_hidden_layers(ctx, t.tokens_pos);
|
||||||
|
if (!success) break;
|
||||||
|
|
||||||
|
cb_data.is_eval_pos = false;
|
||||||
|
success = get_hidden_layers(ctx, t.tokens_neg);
|
||||||
|
if (!success) break;
|
||||||
|
|
||||||
|
// calculate diff and remove all zero rows
|
||||||
|
auto v_diff_filtered = cb_data.calc_diff();
|
||||||
|
|
||||||
|
// save & concat the filtered v_diff to ctx_train
|
||||||
|
ctx_train.concat_diff_tmp(v_diff_filtered);
|
||||||
|
|
||||||
|
// reset for next iteration
|
||||||
|
cb_data.reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
// done with the model, we can now free it to make gain some memory
|
||||||
|
printf("Done evaluate prompts, unload model...\n");
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
|
// prepare ctx_train for PCA
|
||||||
|
ctx_train.build_v_diff();
|
||||||
|
|
||||||
|
// run PCA
|
||||||
|
PCA::pca_params pca_params;
|
||||||
|
pca_params.n_threads = params.n_threads;
|
||||||
|
pca_params.n_batch = params.n_pca_batch;
|
||||||
|
pca_params.n_iterations = params.n_pca_iterations;
|
||||||
|
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
||||||
|
|
||||||
|
// write output vectors to gguf
|
||||||
|
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
1
examples/cvector-generator/negative.txt
Normal file
1
examples/cvector-generator/negative.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
[INST] Act like a person who is extremely sad. [/INST]
|
322
examples/cvector-generator/pca.hpp
Normal file
322
examples/cvector-generator/pca.hpp
Normal file
|
@ -0,0 +1,322 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include "ggml-metal.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <ctime>
|
||||||
|
#include <string>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
|
#define DEBUG_POS 5
|
||||||
|
|
||||||
|
static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
|
||||||
|
printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
|
||||||
|
if (!with_data) return;
|
||||||
|
printf("%s: %s[0] = [", __func__, t->name);
|
||||||
|
for (size_t i = 0; i <= DEBUG_POS; i++) {
|
||||||
|
printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
|
||||||
|
}
|
||||||
|
printf(" ... ]\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace PCA {
|
||||||
|
|
||||||
|
// input params for PCA computations
|
||||||
|
// Settings consumed by the PCA power-iteration routines.
struct pca_params {
    int   n_threads    {1};
    int   n_batch      {20};   // number of iterations to do in one batch; the larger the batch, the more memory is used
    int   n_iterations {1000};
    float tolerance    {1e-7};

    // progress information, only used for debugging/log output
    int   i_layer      {0};
    int   n_layers     {0};
};
|
||||||
|
|
||||||
|
// result from each iteration
|
||||||
|
// Outputs gathered from one computed batch of power-iteration steps.
struct pca_result {
    // graph node holding the squared input matrix; stays null when the graph did not compute it
    struct ggml_tensor * calculated_square {nullptr};
    // one eigenvector estimate (graph node) per step of the batch
    std::vector<struct ggml_tensor *> eigenvectors;
    // one distance value per step of the batch
    std::vector<float> distances;
};
|
||||||
|
|
||||||
|
struct pca_model {
|
||||||
|
ggml_backend_t backend = NULL;
|
||||||
|
ggml_backend_buffer_t buffer;
|
||||||
|
struct ggml_context * ctx; // context to compute graph on target device
|
||||||
|
struct ggml_context * ctx_host; // host context to store results
|
||||||
|
|
||||||
|
// tensors on target device
|
||||||
|
struct ggml_tensor * dev_input;
|
||||||
|
struct ggml_tensor * dev_square;
|
||||||
|
struct ggml_tensor * dev_eigenvector;
|
||||||
|
|
||||||
|
pca_model(struct ggml_tensor * t_input) {
|
||||||
|
// TODO: enable GPU support when support for GGML_OP_SQRT is added
|
||||||
|
// #ifdef GGML_USE_CUDA
|
||||||
|
// fprintf(stderr, "%s: using CUDA backend\n", __func__);
|
||||||
|
// backend = ggml_backend_cuda_init(0); // init device 0
|
||||||
|
// if (!backend) {
|
||||||
|
// fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
|
||||||
|
// }
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
// #ifdef GGML_USE_METAL
|
||||||
|
// fprintf(stderr, "%s: using Metal backend\n", __func__);
|
||||||
|
// backend = ggml_backend_metal_init();
|
||||||
|
// if (!backend) {
|
||||||
|
// fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
|
||||||
|
// }
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
// if there aren't GPU Backends fallback to CPU backend
|
||||||
|
if (!backend) {
|
||||||
|
backend = ggml_backend_cpu_init();
|
||||||
|
}
|
||||||
|
|
||||||
|
const int num_tensors = 4;
|
||||||
|
struct ggml_init_params params {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ctx = ggml_init(params);
|
||||||
|
|
||||||
|
auto n_samples = t_input->ne[0];
|
||||||
|
auto n_embd = t_input->ne[1];
|
||||||
|
|
||||||
|
dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
|
||||||
|
dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||||
|
dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
|
ggml_set_name(dev_input, "dev_input");
|
||||||
|
ggml_set_name(dev_square, "dev_square");
|
||||||
|
ggml_set_name(dev_eigenvector, "dev_eigenvector");
|
||||||
|
buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
||||||
|
ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
|
||||||
|
|
||||||
|
// initialize eigenvector to random normalized vector
|
||||||
|
{
|
||||||
|
std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
|
||||||
|
std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
|
||||||
|
std::uniform_real_distribution<float> distribution(0.0, 1.0);
|
||||||
|
float sum_sqr = 0.0; // for normalizing random_vec
|
||||||
|
for (size_t i = 0; i < random_vec.size(); ++i) {
|
||||||
|
float f = distribution(generator);
|
||||||
|
sum_sqr += f * f;
|
||||||
|
random_vec[i] = f;
|
||||||
|
}
|
||||||
|
// normalize it
|
||||||
|
float random_vec_norm = std::sqrt(sum_sqr);
|
||||||
|
for (size_t i = 0; i < random_vec.size(); ++i) {
|
||||||
|
random_vec[i] /= random_vec_norm;
|
||||||
|
}
|
||||||
|
ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
~pca_model() {
|
||||||
|
ggml_free(ctx);
|
||||||
|
ggml_backend_buffer_free(buffer);
|
||||||
|
ggml_backend_free(backend);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct ggml_cgraph * build_graph_piter(
|
||||||
|
const struct pca_params & params,
|
||||||
|
const pca_model & model,
|
||||||
|
bool calc_square = false) {
|
||||||
|
GGML_ASSERT(params.n_batch > 0);
|
||||||
|
// TODO: buf_size must be able to scale with params.n_batch
|
||||||
|
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
|
||||||
|
static std::vector<uint8_t> buf(buf_size);
|
||||||
|
|
||||||
|
struct ggml_init_params params0 = {
|
||||||
|
/*.mem_size =*/ buf_size,
|
||||||
|
/*.mem_buffer =*/ buf.data(),
|
||||||
|
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
|
||||||
|
};
|
||||||
|
// create a temporally context to build the graph
|
||||||
|
struct ggml_context * ctx0 = ggml_init(params0);
|
||||||
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||||
|
|
||||||
|
// turn v_diff_original into square matrix if needed
|
||||||
|
struct ggml_tensor * tmp_square;
|
||||||
|
if (calc_square) {
|
||||||
|
tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
|
||||||
|
ggml_set_name(tmp_square, "tmp_square");
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * b_tensor;
|
||||||
|
struct ggml_tensor * distance;
|
||||||
|
struct ggml_tensor * old_eigen = model.dev_eigenvector;
|
||||||
|
struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
|
||||||
|
|
||||||
|
for (int i = 0; i < params.n_batch; ++i) {
|
||||||
|
// b_tensor = square * eigenvector^T
|
||||||
|
b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
|
||||||
|
ggml_set_name(b_tensor, "b_tensor");
|
||||||
|
|
||||||
|
// normalize
|
||||||
|
b_tensor = ggml_div_inplace(ctx0,
|
||||||
|
b_tensor,
|
||||||
|
ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
|
||||||
|
);
|
||||||
|
ggml_format_name(b_tensor, "b_tensor_norm_%d", i);
|
||||||
|
|
||||||
|
// calculate distance(new eigenvector - old eigenvector)
|
||||||
|
// we don't use ggml_sub because it may not be implemented on GPU backend
|
||||||
|
struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
|
||||||
|
distance = ggml_sqrt_inplace(ctx0,
|
||||||
|
ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
|
||||||
|
ggml_format_name(distance, "distance_%d", i);
|
||||||
|
|
||||||
|
old_eigen = b_tensor;
|
||||||
|
|
||||||
|
// build operations nodes
|
||||||
|
ggml_build_forward_expand(gf, distance);
|
||||||
|
}
|
||||||
|
|
||||||
|
// delete the temporally context used to build the graph
|
||||||
|
ggml_free(ctx0);
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocates and computes the graph `gf` on model.backend, then collects its outputs.
//
// On success `result` is reset and filled with:
//   - eigenvectors[i]: the graph node named "b_tensor_norm_<i>" (data is not copied)
//   - distances[i]:    the value read back from the node named "distance_<i>"
//   - calculated_square: the node named "tmp_square", or NULL when the graph has none
// On failure `result` is left untouched.
//
// Returns the status of the graph computation, or GGML_STATUS_ALLOC_FAILED when
// the graph tensors could not be allocated.
static ggml_status compute_piter(
        const struct pca_params & params,
        const pca_model & model,
        struct ggml_cgraph * gf,
        ggml_gallocr_t allocr,
        struct pca_result & result) {
    // allocate tensors
    if (!ggml_gallocr_alloc_graph(allocr, gf)) {
        fprintf(stderr, "%s: failed to allocate the graph tensors\n", __func__);
        return GGML_STATUS_ALLOC_FAILED;
    }

    if (ggml_backend_is_cpu(model.backend)) {
        ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
    }

    // TODO: enable GPU support when support for GGML_OP_SQRT is added
    //#ifdef GGML_USE_METAL
    //    if (ggml_backend_is_metal(model.backend)) {
    //        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
    //    }
    //#endif

    ggml_status res = ggml_backend_graph_compute(model.backend, gf);
    if (res != GGML_STATUS_SUCCESS) {
        return res;
    }

    // parses the integer that follows `prefix` in `str`; -1 when `str` does not start with `prefix`
    auto extract_i = [](const std::string & prefix, const std::string & str) -> int {
        int i = -1;
        if (str.rfind(prefix, 0) == 0) {
            sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
        }
        return i;
    };

    result.calculated_square = NULL;
    result.eigenvectors.clear();
    result.distances.clear();
    result.eigenvectors.resize(params.n_batch);
    result.distances.resize(params.n_batch);

    // get output nodes
    for (int i = 0; i < gf->n_nodes; ++i) {
        auto node = gf->nodes[i];
        const std::string node_name(node->name);
        int iter = -1;

        // find b_tensor (without copying data from device)
        if ((iter = extract_i("b_tensor_norm_", node_name)) > -1) {
            // the index comes from a tensor name: make sure it fits the result vectors
            GGML_ASSERT(iter < params.n_batch);
            result.eigenvectors[iter] = node;
        }

        // find distances, then copy data from device
        if ((iter = extract_i("distance_", node_name)) > -1) {
            GGML_ASSERT(iter < params.n_batch);
            float d;
            ggml_backend_tensor_get(node, &d, 0, sizeof(float));
            result.distances[iter] = d;
            // std::cout << node->name << " = " << d << "\n";
        }

        // find tmp_square if it exists (without copying data from device)
        if (node_name == "tmp_square") {
            result.calculated_square = node;
        }
    }

    return res;
}
|
||||||
|
|
||||||
|
static void power_iteration(
|
||||||
|
const struct pca_params & params,
|
||||||
|
struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
|
||||||
|
struct ggml_tensor * output) {
|
||||||
|
//printf("in power iteration\n");
|
||||||
|
struct pca_model model(input);
|
||||||
|
|
||||||
|
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
|
||||||
|
struct pca_result result;
|
||||||
|
struct ggml_tensor * last_eigenvector = NULL;
|
||||||
|
|
||||||
|
int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations
|
||||||
|
for (int iter = 0; iter < n_iters; ++iter) {
|
||||||
|
bool calc_square = (iter == 0); // only need to calculate square for first iteration
|
||||||
|
struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
|
||||||
|
// ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
|
||||||
|
compute_piter(params, model, gf, allocr, result);
|
||||||
|
|
||||||
|
for (size_t k = 0; k < result.distances.size(); ++k) {
|
||||||
|
last_eigenvector = result.eigenvectors[k];
|
||||||
|
if (result.distances[k] < params.tolerance) {
|
||||||
|
break; // done
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (calc_square) {
|
||||||
|
// copy and store the square matrix if needed
|
||||||
|
GGML_ASSERT(result.calculated_square != NULL);
|
||||||
|
ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// copy last eigen vector and store as input for next iteration
|
||||||
|
GGML_ASSERT(last_eigenvector != NULL);
|
||||||
|
ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
|
||||||
|
__func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
|
||||||
|
}
|
||||||
|
|
||||||
|
// get output tensor
|
||||||
|
GGML_ASSERT(last_eigenvector);
|
||||||
|
ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
|
||||||
|
//print_debug_tensor(output);
|
||||||
|
ggml_gallocr_free(allocr);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void run_pca(
|
||||||
|
struct pca_params & params,
|
||||||
|
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
|
||||||
|
const std::vector<struct ggml_tensor *> & v_output) {
|
||||||
|
printf("%s: Running PCA...\n", __func__);
|
||||||
|
for (size_t il = 0; il < v_input.size(); ++il) {
|
||||||
|
|
||||||
|
// prepare output vector
|
||||||
|
struct ggml_tensor * ctrl_out = v_output[il];
|
||||||
|
ggml_format_name(ctrl_out, "direction.%ld", il+1);
|
||||||
|
|
||||||
|
// run power_iteration
|
||||||
|
params.i_layer = il;
|
||||||
|
params.n_layers = v_input.size();
|
||||||
|
power_iteration(params, v_input[il], ctrl_out);
|
||||||
|
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
1
examples/cvector-generator/positive.txt
Normal file
1
examples/cvector-generator/positive.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
[INST] Act like a person who is extremely happy. [/INST]
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET embedding)
|
set(TARGET llama-embedding)
|
||||||
add_executable(${TARGET} embedding.cpp)
|
add_executable(${TARGET} embedding.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
|
||||||
### Unix-based systems (Linux, macOS, etc.):
|
### Unix-based systems (Linux, macOS, etc.):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
|
./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
|
||||||
```
|
```
|
||||||
|
|
||||||
### Windows:
|
### Windows:
|
||||||
|
|
||||||
```powershell
|
```powershell
|
||||||
embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
|
llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
|
||||||
```
|
```
|
||||||
|
|
||||||
The above command will output space-separated float values.
|
The above command will output space-separated float values.
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
set(TARGET eval-callback)
|
set(TARGET llama-eval-callback)
|
||||||
add_executable(${TARGET} eval-callback.cpp)
|
add_executable(${TARGET} eval-callback.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
|
||||||
set(TEST_TARGET test-eval-callback)
|
set(TEST_TARGET test-eval-callback)
|
||||||
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||||
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
||||||
|
|
|
@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data.
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
eval-callback \
|
llama-eval-callback \
|
||||||
--hf-repo ggml-org/models \
|
--hf-repo ggml-org/models \
|
||||||
--hf-file phi-2/ggml-model-q4_0.gguf \
|
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||||
--model phi-2-q4_0.gguf \
|
--model phi-2-q4_0.gguf \
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET export-lora)
|
set(TARGET llama-export-lora)
|
||||||
add_executable(${TARGET} export-lora.cpp)
|
add_executable(${TARGET} export-lora.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
Apply LORA adapters to base model and export the resulting model.
|
Apply LORA adapters to base model and export the resulting model.
|
||||||
|
|
||||||
```
|
```
|
||||||
usage: export-lora [options]
|
usage: llama-export-lora [options]
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
|
@ -17,7 +17,7 @@ options:
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./bin/export-lora \
|
./bin/llama-export-lora \
|
||||||
-m open-llama-3b-v2-q8_0.gguf \
|
-m open-llama-3b-v2-q8_0.gguf \
|
||||||
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
|
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
|
||||||
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
|
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET finetune)
|
set(TARGET llama-finetune)
|
||||||
add_executable(${TARGET} finetune.cpp)
|
add_executable(${TARGET} finetune.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -7,7 +7,7 @@ Basic usage instructions:
|
||||||
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
|
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
|
||||||
|
|
||||||
# finetune LORA adapter
|
# finetune LORA adapter
|
||||||
./bin/finetune \
|
./bin/llama-finetune \
|
||||||
--model-base open-llama-3b-v2-q8_0.gguf \
|
--model-base open-llama-3b-v2-q8_0.gguf \
|
||||||
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
|
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
|
||||||
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
|
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
|
||||||
|
@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
||||||
--use-checkpointing
|
--use-checkpointing
|
||||||
|
|
||||||
# predict
|
# predict
|
||||||
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||||
```
|
```
|
||||||
|
|
||||||
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
|
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
|
||||||
|
@ -38,14 +38,14 @@ After 10 more iterations:
|
||||||
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
|
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
|
||||||
|
|
||||||
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
|
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
|
||||||
These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.
|
These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
|
||||||
|
|
||||||
In `main` you can also load multiple LORA adapters, which will then be mixed together.
|
In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
|
||||||
|
|
||||||
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
|
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./bin/main -m open-llama-3b-v2-q8_0.gguf \
|
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||||
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
|
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
|
||||||
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
|
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
|
||||||
```
|
```
|
||||||
|
@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
|
||||||
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
|
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./bin/main -m open-llama-3b-v2-q8_0.gguf \
|
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||||
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
|
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
|
||||||
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
|
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
|
||||||
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
cd `dirname $0`
|
cd `dirname $0`
|
||||||
cd ../..
|
cd ../..
|
||||||
|
|
||||||
EXE="./finetune"
|
EXE="./llama-finetune"
|
||||||
|
|
||||||
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
|
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
|
||||||
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
|
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
set(TARGET gbnf-validator)
|
set(TARGET llama-gbnf-validator)
|
||||||
add_executable(${TARGET} gbnf-validator.cpp)
|
add_executable(${TARGET} gbnf-validator.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
|
|
@ -7,6 +7,8 @@
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <sstream>
|
||||||
|
#include <fstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
@ -69,13 +71,14 @@ int main(int argc, char** argv) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
fseek(grammar_file, 0, SEEK_END);
|
std::string grammar_str;
|
||||||
size_t grammar_size = ftell(grammar_file);
|
{
|
||||||
fseek(grammar_file, 0, SEEK_SET);
|
std::ifstream grammar_file(grammar_filename);
|
||||||
|
GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
|
||||||
std::string grammar_str(grammar_size, ' ');
|
std::stringstream buffer;
|
||||||
fread(&grammar_str[0], 1, grammar_size, grammar_file);
|
buffer << grammar_file.rdbuf();
|
||||||
fclose(grammar_file);
|
grammar_str = buffer.str();
|
||||||
|
}
|
||||||
|
|
||||||
// Parse the GBNF grammar
|
// Parse the GBNF grammar
|
||||||
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
|
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
|
||||||
|
@ -100,20 +103,15 @@ int main(int argc, char** argv) {
|
||||||
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||||
|
|
||||||
// Read the input file
|
// Read the input file
|
||||||
FILE* input_file = fopen(input_filename.c_str(), "r");
|
std::string input_str;
|
||||||
if (!input_file) {
|
{
|
||||||
fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str());
|
std::ifstream input_file(input_filename);
|
||||||
return 1;
|
GGML_ASSERT(input_file.is_open() && "Failed to open input file");
|
||||||
|
std::stringstream buffer;
|
||||||
|
buffer << input_file.rdbuf();
|
||||||
|
input_str = buffer.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
fseek(input_file, 0, SEEK_END);
|
|
||||||
size_t input_size = ftell(input_file);
|
|
||||||
fseek(input_file, 0, SEEK_SET);
|
|
||||||
|
|
||||||
std::string input_str(input_size, ' ');
|
|
||||||
fread(&input_str[0], 1, input_size, input_file);
|
|
||||||
fclose(input_file);
|
|
||||||
|
|
||||||
// Validate the input string against the grammar
|
// Validate the input string against the grammar
|
||||||
size_t error_pos;
|
size_t error_pos;
|
||||||
std::string error_msg;
|
std::string error_msg;
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET gguf-split)
|
set(TARGET llama-gguf-split)
|
||||||
add_executable(${TARGET} gguf-split.cpp)
|
add_executable(${TARGET} gguf-split.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -18,8 +18,8 @@ fi
|
||||||
|
|
||||||
set -x
|
set -x
|
||||||
|
|
||||||
SPLIT=$1/gguf-split
|
SPLIT=$1/llama-gguf-split
|
||||||
MAIN=$1/main
|
MAIN=$1/llama-cli
|
||||||
WORK_PATH=$TMP_DIR/gguf-split
|
WORK_PATH=$TMP_DIR/gguf-split
|
||||||
ROOT_DIR=$(realpath $(dirname $0)/../../)
|
ROOT_DIR=$(realpath $(dirname $0)/../../)
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET gguf)
|
set(TARGET llama-gguf)
|
||||||
add_executable(${TARGET} gguf.cpp)
|
add_executable(${TARGET} gguf.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -1,15 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
#
|
|
||||||
# Temporary script - will be removed in the future
|
|
||||||
#
|
|
||||||
|
|
||||||
cd `dirname $0`
|
|
||||||
cd ..
|
|
||||||
|
|
||||||
./main --color --instruct --threads 4 \
|
|
||||||
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
|
|
||||||
--file ./prompts/alpaca.txt \
|
|
||||||
--batch_size 8 --ctx_size 2048 -n -1 \
|
|
||||||
--repeat_last_n 64 --repeat_penalty 1.3 \
|
|
||||||
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET gritlm)
|
set(TARGET llama-gritlm)
|
||||||
add_executable(${TARGET} gritlm.cpp)
|
add_executable(${TARGET} gritlm.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -26,7 +26,7 @@ $ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --ou
|
||||||
|
|
||||||
Run the example using the downloaded model:
|
Run the example using the downloaded model:
|
||||||
```console
|
```console
|
||||||
$ ./gritlm -m models/gritlm-7b_q4_1.gguf
|
$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
|
||||||
|
|
||||||
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
|
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
|
||||||
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
|
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET imatrix)
|
set(TARGET llama-imatrix)
|
||||||
add_executable(${TARGET} imatrix.cpp)
|
add_executable(${TARGET} imatrix.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -6,7 +6,7 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
```
|
```
|
||||||
./imatrix \
|
./llama-imatrix \
|
||||||
-m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
|
-m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
|
||||||
[--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
|
[--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
|
||||||
[--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
|
[--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
|
||||||
|
@ -28,8 +28,8 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
|
||||||
LLAMA_CUDA=1 make -j
|
LLAMA_CUDA=1 make -j
|
||||||
|
|
||||||
# generate importance matrix (imatrix.dat)
|
# generate importance matrix (imatrix.dat)
|
||||||
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
||||||
|
|
||||||
# use the imatrix to perform a Q4_K_M quantization
|
# use the imatrix to perform a Q4_K_M quantization
|
||||||
./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
|
./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
|
||||||
```
|
```
|
||||||
|
|
|
@ -218,20 +218,64 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
||||||
fname += std::to_string(ncall);
|
fname += std::to_string(ncall);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// avoid writing imatrix entries that do not have full data
|
||||||
|
// this can happen with MoE models where some of the experts end up not being exercised by the provided training data
|
||||||
|
|
||||||
|
int n_entries = 0;
|
||||||
|
std::vector<std::string> to_store;
|
||||||
|
|
||||||
|
bool is_first = true; // for printing
|
||||||
|
for (const auto & kv : m_stats) {
|
||||||
|
const int n_all = kv.second.counts.size();
|
||||||
|
|
||||||
|
if (n_all == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int n_zeros = 0;
|
||||||
|
for (const int c : kv.second.counts) {
|
||||||
|
if (c == 0) {
|
||||||
|
n_zeros++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n_zeros != 0 && is_first) {
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
is_first = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n_zeros == n_all) {
|
||||||
|
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n_zeros > 0) {
|
||||||
|
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
n_entries++;
|
||||||
|
to_store.push_back(kv.first);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (to_store.size() < m_stats.size()) {
|
||||||
|
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
|
||||||
|
}
|
||||||
|
|
||||||
std::ofstream out(fname, std::ios::binary);
|
std::ofstream out(fname, std::ios::binary);
|
||||||
int n_entries = m_stats.size();
|
|
||||||
out.write((const char *) &n_entries, sizeof(n_entries));
|
out.write((const char *) &n_entries, sizeof(n_entries));
|
||||||
for (const auto & p : m_stats) {
|
for (const auto & name : to_store) {
|
||||||
int len = p.first.size();
|
const auto & stat = m_stats.at(name);
|
||||||
|
int len = name.size();
|
||||||
out.write((const char *) &len, sizeof(len));
|
out.write((const char *) &len, sizeof(len));
|
||||||
out.write(p.first.c_str(), len);
|
out.write(name.c_str(), len);
|
||||||
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
|
out.write((const char *) &stat.ncall, sizeof(stat.ncall));
|
||||||
int nval = p.second.values.size();
|
int nval = stat.values.size();
|
||||||
out.write((const char *) &nval, sizeof(nval));
|
out.write((const char *) &nval, sizeof(nval));
|
||||||
if (nval > 0) {
|
if (nval > 0) {
|
||||||
std::vector<float> tmp(nval);
|
std::vector<float> tmp(nval);
|
||||||
for (int i = 0; i < nval; i++) {
|
for (int i = 0; i < nval; i++) {
|
||||||
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
|
tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
|
||||||
}
|
}
|
||||||
out.write((const char*)tmp.data(), nval*sizeof(float));
|
out.write((const char*)tmp.data(), nval*sizeof(float));
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET infill)
|
set(TARGET llama-infill)
|
||||||
add_executable(${TARGET} infill.cpp)
|
add_executable(${TARGET} infill.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -42,5 +42,5 @@ scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.ggu
|
||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
|
./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
|
||||||
```
|
```
|
||||||
|
|
|
@ -21,7 +21,7 @@ counter=1
|
||||||
echo 'Running'
|
echo 'Running'
|
||||||
while IFS= read -r question
|
while IFS= read -r question
|
||||||
do
|
do
|
||||||
exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
|
exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
|
||||||
echo $counter
|
echo $counter
|
||||||
echo "Current Question: $question"
|
echo "Current Question: $question"
|
||||||
eval "$exe_cmd"
|
eval "$exe_cmd"
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Usage:
|
# Usage:
|
||||||
#! ./server -m some-model.gguf &
|
#! ./llama-server -m some-model.gguf &
|
||||||
#! pip install pydantic
|
#! pip install pydantic
|
||||||
#! python json-schema-pydantic-example.py
|
#! python json-schema-pydantic-example.py
|
||||||
|
|
||||||
|
|
|
@ -29,9 +29,8 @@ class BuiltinRule:
|
||||||
self.content = content
|
self.content = content
|
||||||
self.deps = deps or []
|
self.deps = deps or []
|
||||||
|
|
||||||
# whitespace is constrained to a single space char to prevent model "running away" in
|
# Constraining spaces to prevent model "running away".
|
||||||
# whitespace. Also maybe improves generation quality?
|
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
|
||||||
SPACE_RULE = '" "?'
|
|
||||||
|
|
||||||
PRIMITIVE_RULES = {
|
PRIMITIVE_RULES = {
|
||||||
'boolean' : BuiltinRule('("true" | "false") space', []),
|
'boolean' : BuiltinRule('("true" | "false") space', []),
|
||||||
|
@ -43,7 +42,7 @@ PRIMITIVE_RULES = {
|
||||||
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
|
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
|
||||||
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
|
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
|
||||||
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
|
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
|
||||||
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
|
'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
|
||||||
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
|
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
|
||||||
'null' : BuiltinRule('"null" space', []),
|
'null' : BuiltinRule('"null" space', []),
|
||||||
}
|
}
|
||||||
|
@ -524,7 +523,7 @@ class SchemaConverter:
|
||||||
def main(args_in = None):
|
def main(args_in = None):
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description='''
|
description='''
|
||||||
Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
|
Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
|
||||||
given JSON schema. Only a subset of JSON schema features are supported; more may be
|
given JSON schema. Only a subset of JSON schema features are supported; more may be
|
||||||
added in the future.
|
added in the future.
|
||||||
''',
|
''',
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# llama.cpp/example/llama-bench
|
# llama.cpp/examples/llama-bench
|
||||||
|
|
||||||
Performance testing tool for llama.cpp.
|
Performance testing tool for llama.cpp.
|
||||||
|
|
||||||
|
|
|
@ -293,6 +293,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
params.output_format = cmd_params_defaults.output_format;
|
params.output_format = cmd_params_defaults.output_format;
|
||||||
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
||||||
params.reps = cmd_params_defaults.reps;
|
params.reps = cmd_params_defaults.reps;
|
||||||
|
params.numa = cmd_params_defaults.numa;
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
arg = argv[i];
|
arg = argv[i];
|
||||||
|
@ -713,7 +714,6 @@ struct test {
|
||||||
static const bool kompute;
|
static const bool kompute;
|
||||||
static const bool metal;
|
static const bool metal;
|
||||||
static const bool sycl;
|
static const bool sycl;
|
||||||
static const bool rpc;
|
|
||||||
static const bool gpu_blas;
|
static const bool gpu_blas;
|
||||||
static const bool blas;
|
static const bool blas;
|
||||||
static const std::string cpu_info;
|
static const std::string cpu_info;
|
||||||
|
@ -725,6 +725,7 @@ struct test {
|
||||||
int n_batch;
|
int n_batch;
|
||||||
int n_ubatch;
|
int n_ubatch;
|
||||||
int n_threads;
|
int n_threads;
|
||||||
|
bool has_rpc;
|
||||||
ggml_type type_k;
|
ggml_type type_k;
|
||||||
ggml_type type_v;
|
ggml_type type_v;
|
||||||
int n_gpu_layers;
|
int n_gpu_layers;
|
||||||
|
@ -750,6 +751,7 @@ struct test {
|
||||||
n_batch = inst.n_batch;
|
n_batch = inst.n_batch;
|
||||||
n_ubatch = inst.n_ubatch;
|
n_ubatch = inst.n_ubatch;
|
||||||
n_threads = inst.n_threads;
|
n_threads = inst.n_threads;
|
||||||
|
has_rpc = !inst.rpc_servers.empty();
|
||||||
type_k = inst.type_k;
|
type_k = inst.type_k;
|
||||||
type_v = inst.type_v;
|
type_v = inst.type_v;
|
||||||
n_gpu_layers = inst.n_gpu_layers;
|
n_gpu_layers = inst.n_gpu_layers;
|
||||||
|
@ -809,9 +811,6 @@ struct test {
|
||||||
if (sycl) {
|
if (sycl) {
|
||||||
return GGML_SYCL_NAME;
|
return GGML_SYCL_NAME;
|
||||||
}
|
}
|
||||||
if (rpc) {
|
|
||||||
return "RPC";
|
|
||||||
}
|
|
||||||
if (gpu_blas) {
|
if (gpu_blas) {
|
||||||
return "GPU BLAS";
|
return "GPU BLAS";
|
||||||
}
|
}
|
||||||
|
@ -881,7 +880,7 @@ struct test {
|
||||||
std::vector<std::string> values = {
|
std::vector<std::string> values = {
|
||||||
build_commit, std::to_string(build_number),
|
build_commit, std::to_string(build_number),
|
||||||
std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
|
std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
|
||||||
std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
|
std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
|
||||||
cpu_info, gpu_info,
|
cpu_info, gpu_info,
|
||||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||||
std::to_string(n_batch), std::to_string(n_ubatch),
|
std::to_string(n_batch), std::to_string(n_ubatch),
|
||||||
|
@ -915,7 +914,6 @@ const bool test::metal = !!ggml_cpu_has_metal();
|
||||||
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
|
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
|
||||||
const bool test::blas = !!ggml_cpu_has_blas();
|
const bool test::blas = !!ggml_cpu_has_blas();
|
||||||
const bool test::sycl = !!ggml_cpu_has_sycl();
|
const bool test::sycl = !!ggml_cpu_has_sycl();
|
||||||
const bool test::rpc = !!ggml_cpu_has_rpc();
|
|
||||||
const std::string test::cpu_info = get_cpu_info();
|
const std::string test::cpu_info = get_cpu_info();
|
||||||
const std::string test::gpu_info = get_gpu_info();
|
const std::string test::gpu_info = get_gpu_info();
|
||||||
|
|
||||||
|
@ -1033,6 +1031,27 @@ struct markdown_printer : public printer {
|
||||||
if (field == "n_gpu_layers") {
|
if (field == "n_gpu_layers") {
|
||||||
return 3;
|
return 3;
|
||||||
}
|
}
|
||||||
|
if (field == "n_threads") {
|
||||||
|
return 7;
|
||||||
|
}
|
||||||
|
if (field == "n_batch") {
|
||||||
|
return 7;
|
||||||
|
}
|
||||||
|
if (field == "n_ubatch") {
|
||||||
|
return 8;
|
||||||
|
}
|
||||||
|
if (field == "type_k" || field == "type_v") {
|
||||||
|
return 6;
|
||||||
|
}
|
||||||
|
if (field == "split_mode") {
|
||||||
|
return 5;
|
||||||
|
}
|
||||||
|
if (field == "flash_attn") {
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
if (field == "use_mmap") {
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
if (field == "test") {
|
if (field == "test") {
|
||||||
return 13;
|
return 13;
|
||||||
}
|
}
|
||||||
|
@ -1160,6 +1179,9 @@ struct markdown_printer : public printer {
|
||||||
value = buf;
|
value = buf;
|
||||||
} else if (field == "backend") {
|
} else if (field == "backend") {
|
||||||
value = test::get_backend();
|
value = test::get_backend();
|
||||||
|
if (t.has_rpc) {
|
||||||
|
value += "+RPC";
|
||||||
|
}
|
||||||
} else if (field == "test") {
|
} else if (field == "test") {
|
||||||
if (t.n_prompt > 0 && t.n_gen == 0) {
|
if (t.n_prompt > 0 && t.n_gen == 0) {
|
||||||
snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
|
snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
|
||||||
|
|
|
@ -1,18 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
#
|
|
||||||
# Temporary script - will be removed in the future
|
|
||||||
#
|
|
||||||
|
|
||||||
cd `dirname $0`
|
|
||||||
cd ..
|
|
||||||
|
|
||||||
./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
|
|
||||||
--color \
|
|
||||||
--ctx_size 2048 \
|
|
||||||
-n -1 \
|
|
||||||
-ins -b 256 \
|
|
||||||
--top_k 10000 \
|
|
||||||
--temp 0.2 \
|
|
||||||
--repeat_penalty 1.1 \
|
|
||||||
-t 8
|
|
|
@ -1,18 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
#
|
|
||||||
# Temporary script - will be removed in the future
|
|
||||||
#
|
|
||||||
|
|
||||||
cd `dirname $0`
|
|
||||||
cd ..
|
|
||||||
|
|
||||||
./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
|
|
||||||
--color \
|
|
||||||
--ctx_size 2048 \
|
|
||||||
-n -1 \
|
|
||||||
-ins -b 256 \
|
|
||||||
--top_k 10000 \
|
|
||||||
--temp 0.2 \
|
|
||||||
--repeat_penalty 1.1 \
|
|
||||||
-t 8
|
|
|
@ -30,8 +30,9 @@ if(TARGET BUILD_INFO)
|
||||||
add_dependencies(llava BUILD_INFO)
|
add_dependencies(llava BUILD_INFO)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(TARGET llava-cli)
|
set(TARGET llama-llava-cli)
|
||||||
add_executable(llava-cli llava-cli.cpp)
|
add_executable(${TARGET} llava-cli.cpp)
|
||||||
install(TARGETS llava-cli RUNTIME)
|
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
|
||||||
target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_compile_features(llava PRIVATE cxx_std_11)
|
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue