Merge branch 'ggerganov:master' into bitnet
commit abcdc5033a
259 changed files with 43256 additions and 36986 deletions
@@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
 stage('Running llama.cpp'){
 sh'''#!/bin/bash
 module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
-qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
 cat llama_log.txt # Printing results
 '''
 }

@@ -23,13 +23,13 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1

-RUN make -j$(nproc) main
+RUN make -j$(nproc) llama-cli

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

 RUN apt-get update && \
     apt-get install -y libgomp1

-COPY --from=build /app/main /main
+COPY --from=build /app/llama-cli /llama-cli

-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]
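
Note: with this rename the light images start /llama-cli instead of /main, so existing docker run command lines only change in the entrypoint name. A minimal usage sketch, assuming the published light-cuda tag and a locally mounted model (both are placeholders, not taken from this commit):

# Hypothetical invocation of the rebuilt light CUDA image; tag, model path and GPU flags are placeholders
docker run --gpus all -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light-cuda -m /models/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9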

.devops/llama-cli-intel.Dockerfile (new file, 26 lines)
@@ -0,0 +1,26 @@
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+
+ARG LLAMA_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+    echo "LLAMA_SYCL_F16 is set" && \
+    export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+    fi && \
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target llama-cli
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-cli" ]
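
Note: the new SYCL image can be built straight from this Dockerfile; a sketch with the build argument defined above (the image name is a placeholder):

# Build the light Intel/SYCL image from the new Dockerfile; the -t name is a placeholder
docker build -f .devops/llama-cli-intel.Dockerfile --build-arg LLAMA_SYCL_F16=ON -t llama-cpp-sycl-light .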

@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

-RUN make -j$(nproc) main
+RUN make -j$(nproc) llama-cli

-ENTRYPOINT [ "/app/main" ]
+ENTRYPOINT [ "/app/llama-cli" ]

@@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DLLAMA_VULKAN=1 && \
-    cmake --build build --config Release --target main
+    cmake --build build --config Release --target llama-cli

 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/main /main && \
+RUN cp /app/build/bin/llama-cli /llama-cli && \
     rm -rf /app

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]

@@ -9,15 +9,15 @@ WORKDIR /app

 COPY . .

-RUN make -j$(nproc) main
+RUN make -j$(nproc) llama-cli

 FROM ubuntu:$UBUNTU_VERSION as runtime

 RUN apt-get update && \
     apt-get install -y libgomp1

-COPY --from=build /app/main /main
+COPY --from=build /app/llama-cli /llama-cli

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]

@@ -36,9 +36,9 @@ make -j LLAMA_CLBLAST=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamaclblast
-cp -p server %{buildroot}%{_bindir}/llamaclblastserver
-cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service

@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -67,9 +67,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llamaclblast
-%{_bindir}/llamaclblastserver
-%{_bindir}/llamaclblastsimple
+%{_bindir}/llama-clblast-cli
+%{_bindir}/llama-clblast-server
+%{_bindir}/llama-clblast-simple
 /usr/lib/systemd/system/llamaclblast.service
 %config /etc/sysconfig/llama

@@ -36,9 +36,9 @@ make -j LLAMA_CUDA=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcuda
-cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service

@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -67,9 +67,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llamacppcuda
-%{_bindir}/llamacppcudaserver
-%{_bindir}/llamacppcudasimple
+%{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-server
+%{_bindir}/llama-cuda-simple
 /usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama

@@ -38,9 +38,9 @@ make -j

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llama
-cp -p server %{buildroot}%{_bindir}/llamaserver
-cp -p simple %{buildroot}%{_bindir}/llamasimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service

@@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -69,9 +69,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama
-%{_bindir}/llamaserver
-%{_bindir}/llamasimple
+%{_bindir}/llama-cli
+%{_bindir}/llama-server
+%{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service
 %config /etc/sysconfig/llama
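
Note: the installed systemd unit reads /etc/sysconfig/llama and passes $LLAMA_ARGS to the renamed llama-server binary. A hedged sketch of that environment file (model path and port value are placeholders, not shipped by the spec):

# /etc/sysconfig/llama (sketch; values are placeholders)
LLAMA_ARGS="-m /opt/models/model.gguf --port 8080"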

@@ -25,13 +25,13 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1

-RUN make -j$(nproc) server
+RUN make -j$(nproc) llama-server

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1

-COPY --from=build /app/server /server
+COPY --from=build /app/llama-server /llama-server

-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]

.devops/llama-server-intel.Dockerfile (new file, 29 lines)
@@ -0,0 +1,29 @@
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+
+ARG LLAMA_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+    echo "LLAMA_SYCL_F16 is set" && \
+    export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+    fi && \
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target llama-server
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-server" ]

@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev

-RUN make -j$(nproc)
+RUN make -j$(nproc) llama-server

-ENTRYPOINT [ "/app/server" ]
+ENTRYPOINT [ "/app/llama-server" ]

@@ -19,13 +19,13 @@ RUN apt-get update && \
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
-    cmake --build build --config Release --target server
+    cmake --build build --config Release --target llama-server

 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/server /server && \
+RUN cp /app/build/bin/llama-server /llama-server && \
     rm -rf /app

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]

@@ -11,15 +11,15 @@ COPY . .

 ENV LLAMA_CURL=1

-RUN make -j$(nproc) server
+RUN make -j$(nproc) llama-server

 FROM ubuntu:$UBUNTU_VERSION as runtime

 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1

-COPY --from=build /app/server /server
+COPY --from=build /app/llama-server /llama-server

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]

@@ -1,34 +0,0 @@
-ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
-
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
-
-ARG LLAMA_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
-    echo "LLAMA_SYCL_F16 is set" && \
-    export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
-    fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
-    cmake --build build --config Release --target main
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
-
-COPY --from=build /app/build/bin/main /main
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/main" ]

@@ -6,11 +6,11 @@
 let
 inherit (config.packages) default;
 binaries = [
-  "llama"
+  "llama-cli"
   "llama-embedding"
   "llama-server"
-  "quantize"
-  "train-text-from-scratch"
+  "llama-quantize"
+  "llama-train-text-from-scratch"
 ];
 mkApp = name: {
   type = "app";
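
Note: for Nix users the flake app names follow the binaries list above, so invocations change with the rename. A hedged usage sketch (the attribute names are assumed from that list, the model path is a placeholder):

# Hypothetical flake invocations after the rename
nix run .#llama-cli -- -m ./model.gguf -p "Anything" -n 9
nix run .#llama-server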

@@ -243,8 +243,6 @@ effectiveStdenv.mkDerivation (
 # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
 # if they haven't been added yet.
 postInstall = ''
-  mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
-  mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
   mkdir -p $out/include
   cp $src/llama.h $out/include/
 '';

@@ -294,7 +292,7 @@ effectiveStdenv.mkDerivation (
 license = lib.licenses.mit;

 # Accommodates `nix run` and `lib.getExe`
-mainProgram = "llama";
+mainProgram = "llama-cli";

 # These people might respond, on the best effort basis, if you ping them
 # in case of Nix-specific regressions or for reviewing Nix-specific PRs.

@@ -1,45 +0,0 @@
-ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
-
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
-
-ARG LLAMA_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
-    echo "LLAMA_SYCL_F16 is set" && \
-    export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
-    fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
-    cmake --build build --config Release --target server
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
-
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
-
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-
-COPY --from=build /app/build/bin/server /server
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/server" ]

@@ -10,11 +10,11 @@ shift
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
     python3 ./convert-hf-to-gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./quantize "$@"
+    ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./main "$@"
+    ./llama-cli "$@"
 elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-    ./finetune "$@"
+    ./llama-finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do

@@ -22,11 +22,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
         echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
     else
         echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-        ./quantize "$i" "${i/f16/q4_0}" q4_0
+        ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
     fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./server "$@"
+    ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
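
Note: tools.sh keeps its flags and simply dispatches to the renamed binaries, so wrappers around the full image keep working. A usage sketch against the full image (image tag and mounted paths are placeholders):

# Hypothetical calls into the full image's tools.sh
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --quantize /models/model-f16.gguf /models/model-q4_0.gguf q4_0
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/model-q4_0.gguf -p "Anything" -n 9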

@@ -12,8 +12,8 @@ build*/

 models/*

-/main
-/quantize
+/llama-cli
+/llama-quantize

 arm_neon.h
 compile_commands.json

@@ -26,3 +26,6 @@ indent_size = 2

 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
+
+[examples/cvector-generator/*.txt]
+insert_final_newline = unset

.github/ISSUE_TEMPLATE/01-bug-low.yml (vendored, 2 lines changed)
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-  $./main --version
+  $./llama-cli --version
   version: 2999 (42b4109e)
   built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/02-bug-medium.yml (vendored, 2 lines changed)
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-  $./main --version
+  $./llama-cli --version
   version: 2999 (42b4109e)
   built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/03-bug-high.yml (vendored, 2 lines changed)
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-  $./main --version
+  $./llama-cli --version
   version: 2999 (42b4109e)
   built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/04-bug-critical.yml (vendored, 2 lines changed)
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-  $./main --version
+  $./llama-cli --version
   version: 2999 (42b4109e)
   built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/labeler.yml (vendored, 1 line changed)
@@ -42,7 +42,6 @@ build:
   - cmake/**
   - CMakeLists.txt
   - CMakePresets.json
-  - codecov.yml
 examples:
   - changed-files:
     - any-glob-to-any-file: examples/**

.github/pull_request_template.md (vendored, 12 lines changed)
@@ -1,5 +1,7 @@
-- Self Reported Review Complexity:
-  - [ ] Review Complexity : Low
-  - [ ] Review Complexity : Medium
-  - [ ] Review Complexity : High
-- [ ] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+- Self-reported review complexity:
+  - [ ] Low
+  - [ ] Medium
+  - [ ] High

.github/workflows/bench.yml (vendored, 2 lines changed)
@@ -119,7 +119,7 @@ jobs:
   -DLLAMA_FATAL_WARNINGS=OFF \
   -DLLAMA_ALL_WARNINGS=OFF \
   -DCMAKE_BUILD_TYPE=Release;
-  cmake --build build --config Release -j $(nproc) --target server
+  cmake --build build --config Release -j $(nproc) --target llama-server

 - name: Download the dataset
   id: download_dataset

.github/workflows/build.yml (vendored, 12 lines changed)
@@ -84,7 +84,7 @@ jobs:
 name: llama-bin-macos-arm64.zip

 macOS-latest-cmake-x64:
-  runs-on: macos-latest
+  runs-on: macos-12

 steps:
   - name: Clone

@@ -103,12 +103,10 @@ jobs:
 id: cmake_build
 run: |
   sysctl -a
-  mkdir build
-  cd build
   # Metal is disabled due to intermittent failures with Github runners not having a GPU:
   # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-  cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
-  cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+  cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
+  cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test
   id: cmake_test

@@ -241,8 +239,8 @@ jobs:
 wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
 echo "Fetch llama2c model"
 wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

 - name: Determine tag name
   id: tag
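
Note: the macOS x64 job switches from mkdir build && cd build to an out-of-tree configure. The same two commands from the workflow can be run locally as-is:

# Local equivalent of the updated CI build step (flags copied from the workflow above)
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)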

.github/workflows/code-coverage.yml (vendored, 40 lines changed)
@@ -1,40 +0,0 @@
-name: Code Coverage
-on: [push, pull_request]
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  run:
-    runs-on: ubuntu-20.04
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential gcc-8 lcov
-
-      - name: Build
-        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
-
-      - name: Run tests
-        run: CC=gcc-8 make test
-
-      - name: Generate coverage report
-        run: |
-          make coverage
-          make lcov-report
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v3
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-        with:
-          files: lcov-report/coverage.info

.github/workflows/docker.yml (vendored, 16 lines changed)
@@ -30,20 +30,20 @@ jobs:
 strategy:
   matrix:
     config:
-      - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+      - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+      - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
       - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-      - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
       # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
       # have disabled them for now until the reason why
       # is understood.
-      - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
+      - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
+      - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
       - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-      - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
-      - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+      - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+      - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
       - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-      - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-      - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
-      - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
+      - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
+      - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
 steps:
   - name: Check out the repo
     uses: actions/checkout@v4

.github/workflows/server.yml (vendored, 18 lines changed)
@@ -87,8 +87,22 @@ jobs:
   exit 1
 fi

+- name: Build (no OpenMP)
+  id: cmake_build_no_openmp
+  if: ${{ matrix.sanitizer == 'THREAD' }}
+  run: |
+    cmake -B build \
+      -DLLAMA_NATIVE=OFF \
+      -DLLAMA_BUILD_SERVER=ON \
+      -DLLAMA_CURL=ON \
+      -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+      -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+      -DLLAMA_OPENMP=OFF ;
+    cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
 - name: Build
   id: cmake_build
+  if: ${{ matrix.sanitizer != 'THREAD' }}
   run: |
     cmake -B build \
       -DLLAMA_NATIVE=OFF \

@@ -96,7 +110,7 @@ jobs:
       -DLLAMA_CURL=ON \
       -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
       -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-    cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
+    cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

 - name: Tests
   id: server_integration_tests

@@ -136,7 +150,7 @@ jobs:
 id: cmake_build
 run: |
   cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
-  cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
+  cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

 - name: Python setup
   id: setup_python

.gitignore (vendored, 152 lines changed)
@@ -1,129 +1,123 @@
-*.o
-
-*.a
-*.so
-*.gguf
-*.gguf.json
-*.bin
-*.exe
-*.dll
-*.log
-*.gcov
-*.gcno
-*.gcda
-*.dot
-*.bat
-*.tmp
-*.metallib
-*.etag
-*.lastModified
-.DS_Store
-.build/
-.cache/
-.ccls-cache/
-.direnv/
-.envrc
-.swiftpm
-.venv
-.clang-tidy
-.vs/
-.vscode/
-.idea/
-
-ggml-metal-embed.metal
-
-lcov-report/
-
-gcovr-report/
-tags
-build*
-!build.zig
-cmake-build-*
-android-ndk-*
-out/
-tmp/
-
-models/*
-models-mnt
-/Pipfile
-/baby-llama
-/beam-search
-/benchmark-matmult
-/convert-llama2c-to-ggml
-/embd-input-test
-/embedding
-/eval-callback
-/gguf
-/gguf-llama-simple
-/gguf-split
-/gritlm
-/imatrix
-/infill
-/libllama.so
-/llama-bench
-/llava-cli
-/lookahead
-/lookup
-/lookup-create
-/lookup-merge
-/lookup-stats
-/main
-/metal
-/passkey
-/perplexity
-/q8dot
-/quantize
-/quantize-stats
-/result
-/save-load-state
-/server
-/simple
-/batched
-/batched-bench
-/export-lora
-/finetune
-/retrieval
-/speculative
-/parallel
-/train-text-from-scratch
-/tokenize
-/vdot
-/common/build-info.cpp
-arm_neon.h
-compile_commands.json
-CMakeSettings.json
-
-__pycache__
-dist
-
-zig-out/
-zig-cache/
-
-ppl-*.txt
-qnt-*.txt
-perf-*.txt
-
-examples/jeopardy/results.txt
-examples/server/*.html.hpp
-examples/server/*.js.hpp
-examples/server/*.mjs.hpp
-examples/server/*.css.hpp
-poetry.lock
-poetry.toml
-nppBackup
-
-# Test binaries
-/tests/test-grammar-parser
-/tests/test-llama-grammar
-/tests/test-double-float
-/tests/test-grad0
-/tests/test-opt
-/tests/test-quantize-fns
-/tests/test-quantize-perf
-/tests/test-sampling
-/tests/test-tokenizer-0
-/tests/test-tokenizer-1-spm
-/tests/test-tokenizer-1-bpe
-/tests/test-rope
-/tests/test-backend-ops
+# Extensions
+
+*.a
+*.bat
+*.bin
+*.dll
+*.dot
+*.etag
+*.exe
+*.gcda
+*.gcno
+*.gcov
+*.gguf
+*.gguf.json
+*.lastModified
+*.log
+*.metallib
+*.o
+*.so
+*.tmp
+
+# IDE / OS
+
+.cache/
+.ccls-cache/
+.direnv/
+.DS_Store
+.envrc
+.idea/
+.swiftpm
+.vs/
+.vscode/
+nppBackup
+
+# Coverage
+
+gcovr-report/
+lcov-report/
+
+# Build Artifacts
+
+tags
+.build/
+build*
+!build-info.cmake
+!build-info.cpp.in
+!build-info.sh
+!build.zig
+/libllama.so
+/llama-*
+android-ndk-*
+arm_neon.h
+cmake-build-*
+CMakeSettings.json
+compile_commands.json
+ggml-metal-embed.metal
+llama-batched-swift
+out/
+tmp/
+
+# CI
+
+!.github/workflows/*.yml
+
+# Models
+
+models/*
+models-mnt
+!models/.editorconfig
+!models/ggml-vocab-*.gguf*
+
+# Zig
+
+zig-out/
+zig-cache/
+
+# Logs
+
+ppl-*.txt
+qnt-*.txt
+perf-*.txt
+
+# Examples
+
+examples/jeopardy/results.txt
+examples/server/*.css.hpp
+examples/server/*.html.hpp
+examples/server/*.js.hpp
+examples/server/*.mjs.hpp
+!build_64.sh
+!examples/*.bat
+!examples/*/*.kts
+!examples/*/*/*.kts
+!examples/sycl/*.bat
+!examples/sycl/*.sh
+
+# Python
+
+__pycache__
+.venv
+/Pipfile
+dist
+poetry.lock
+poetry.toml
+
+# Test binaries
+/tests/test-backend-ops
+/tests/test-double-float
+/tests/test-grad0
+/tests/test-grammar-parser
+/tests/test-llama-grammar
+/tests/test-opt
+/tests/test-quantize-fns
+/tests/test-quantize-perf
+/tests/test-rope
+/tests/test-sampling
+/tests/test-tokenizer-0
+/tests/test-tokenizer-1-bpe
+/tests/test-tokenizer-1-spm
+
+# Scripts
+!/scripts/install-oneapi.bat

@@ -39,8 +39,12 @@ endif()

 if (APPLE)
     set(LLAMA_METAL_DEFAULT ON)
+    set(LLAMA_BLAS_DEFAULT ON)
+    set(LLAMA_BLAS_VENDOR_DEFAULT "Apple")
 else()
     set(LLAMA_METAL_DEFAULT OFF)
+    set(LLAMA_BLAS_DEFAULT OFF)
+    set(LLAMA_BLAS_VENDOR_DEFAULT "Generic")
 endif()

 set(LLAMA_LLAMAFILE_DEFAULT ON)

@@ -91,9 +95,10 @@ endif()

 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_BLAS "llama: use BLAS" OFF)
+option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT})
+set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING
+    "llama: BLAS library vendor")
 option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
-set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUDA "llama: use CUDA" OFF)
 option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)

@@ -114,6 +119,7 @@ option(LLAMA_HIP_UMA "llama: use HIP unified memory arch
 option(LLAMA_VULKAN "llama: use Vulkan" OFF)
 option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
 option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
+option(LLAMA_VULKAN_MEMORY_DEBUG "llama: enable Vulkan memory debug output" OFF)
 option(LLAMA_VULKAN_VALIDATE "llama: enable Vulkan validation" OFF)
 option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF)
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})

@@ -311,9 +317,9 @@ if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
-    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
-        set(BLA_SIZEOF_INTEGER 8)
-    endif()
+    #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
+    #    set(BLA_SIZEOF_INTEGER 8)
+    #endif()

     set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
     find_package(BLAS)

@@ -321,7 +327,7 @@ if (LLAMA_BLAS)
     if (BLAS_FOUND)
         message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")

-        if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
+        if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple"))
            # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
            # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
            find_package(PkgConfig REQUIRED)

@@ -374,12 +380,15 @@ if (LLAMA_BLAS)

         add_compile_options(${BLAS_LINKER_FLAGS})

-        add_compile_definitions(GGML_USE_OPENBLAS)
+        add_compile_definitions(GGML_USE_BLAS)

         if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
             add_compile_definitions(GGML_BLAS_USE_MKL)
         endif()

+        set(GGML_HEADERS_BLAS ggml-blas.h)
+        set(GGML_SOURCES_BLAS ggml-blas.cpp)
+
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
     else()

@@ -526,6 +535,10 @@ if (LLAMA_VULKAN)
         add_compile_definitions(GGML_VULKAN_DEBUG)
     endif()

+    if (LLAMA_VULKAN_MEMORY_DEBUG)
+        add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
+    endif()
+
     if (LLAMA_VULKAN_VALIDATE)
         add_compile_definitions(GGML_VULKAN_VALIDATE)
     endif()

@@ -652,6 +665,7 @@ if (LLAMA_SYCL)
     #todo: AOT

     find_package(IntelSYCL REQUIRED)
+    find_package(MKL REQUIRED)

     message(STATUS "SYCL found")

@@ -666,21 +680,22 @@ if (LLAMA_SYCL)
     endif()

     add_compile_options(-I./) #include DPCT
-    add_compile_options(-I/${SYCL_INCLUDE_DIR})

     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
     if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
     endif()

     set(GGML_HEADERS_SYCL ggml-sycl.h)
-    set(GGML_SOURCES_SYCL ggml-sycl.cpp)
+    file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
+    list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")

     if (WIN32)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
     else()
+        add_compile_options(-I/${SYCL_INCLUDE_DIR})
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
         if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
         elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")

@@ -1258,6 +1273,7 @@ add_library(ggml OBJECT
     ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
     ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
     ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
+    ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
     ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
     )
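
Note: with these changes BLAS becomes its own ggml backend (ggml-blas.cpp/.h) and is enabled by default with the Apple vendor on macOS. A hedged configure sketch for turning the new option on elsewhere (the OpenBLAS vendor name follows CMake's FindBLAS conventions and is an assumption, not part of this diff):

# Sketch: enable the new BLAS backend explicitly on a non-Apple platform
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release -j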

@@ -11,9 +11,21 @@
         "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
       }
     },
+    {
+      "name": "sycl-base",
+      "hidden": true,
+      "generator": "Ninja",
+      "binaryDir": "${sourceDir}/build-${presetName}",
+      "cacheVariables": {
+        "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+        "CMAKE_CXX_COMPILER": "icx",
+        "LLAMA_SYCL": "ON",
+        "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+      }
+    },
     { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+    { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
     { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },

     {

@@ -35,15 +47,18 @@
     },

     { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },

     { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },

     { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
-    { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
-    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
+    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
+    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
+
+    { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
+    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
   ]
 }
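
Note: the new sycl-base preset ties LLAMA_SYCL, the icx compiler, and a per-preset build directory together, so a SYCL build can be driven purely through presets. A usage sketch (preset and directory names follow the block above; running from an environment with oneAPI sourced is assumed):

# Sketch: configure and build via the new SYCL presets
cmake --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release --config Release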
176
Makefile
176
Makefile
|
@ -1,8 +1,45 @@
|
||||||
# Define the default target now so that it is always the first target
|
# Define the default target now so that it is always the first target
|
||||||
BUILD_TARGETS = \
|
BUILD_TARGETS = \
|
||||||
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
libllava.a \
|
||||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
|
llama-baby-llama \
|
||||||
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
llama-batched \
|
||||||
|
llama-batched-bench \
|
||||||
|
llama-bench \
|
||||||
|
llama-benchmark-matmult \
|
||||||
|
llama-cli \
|
||||||
|
llama-convert-llama2c-to-ggml \
|
||||||
|
llama-embedding \
|
||||||
|
llama-eval-callback \
|
||||||
|
llama-export-lora \
|
||||||
|
llama-finetune \
|
||||||
|
llama-gbnf-validator \
|
||||||
|
llama-gguf \
|
||||||
|
llama-gguf-split \
|
||||||
|
llama-gritlm \
|
||||||
|
llama-imatrix \
|
||||||
|
llama-infill \
|
||||||
|
llama-llava-cli \
|
||||||
|
llama-lookahead \
|
||||||
|
llama-lookup \
|
||||||
|
llama-lookup-create \
|
||||||
|
llama-lookup-merge \
|
||||||
|
llama-lookup-stats \
|
||||||
|
llama-parallel \
|
||||||
|
llama-passkey \
|
||||||
|
llama-perplexity \
|
||||||
|
llama-q8dot \
|
||||||
|
llama-quantize \
|
||||||
|
llama-quantize-stats \
|
||||||
|
llama-retrieval \
|
||||||
|
llama-save-load-state \
|
||||||
|
llama-server \
|
||||||
|
llama-simple \
|
||||||
|
llama-speculative \
|
||||||
|
llama-tokenize \
|
||||||
|
llama-train-text-from-scratch \
|
||||||
|
llama-vdot \
|
||||||
|
llama-cvector-generator \
|
||||||
|
tests/test-c.o
|
||||||
|
|
||||||
# Binaries only useful for tests
|
# Binaries only useful for tests
|
||||||
TEST_TARGETS = \
|
TEST_TARGETS = \
|
||||||
@@ -404,10 +441,11 @@ ifndef LLAMA_NO_ACCELERATE
# Mac OS - include Accelerate framework.
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
ifeq ($(UNAME_S),Darwin)
-MK_CPPFLAGS += -DGGML_USE_ACCELERATE
+MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
MK_LDFLAGS += -framework Accelerate
+OBJS += ggml-blas.o
endif
endif # LLAMA_NO_ACCELERATE

@@ -418,21 +456,30 @@ ifndef LLAMA_NO_OPENMP
endif # LLAMA_NO_OPENMP

ifdef LLAMA_OPENBLAS
-MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
+MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
MK_LDFLAGS += $(shell pkg-config --libs openblas)
+OBJS += ggml-blas.o
endif # LLAMA_OPENBLAS

+ifdef LLAMA_OPENBLAS64
+MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
+MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64)
+MK_LDFLAGS += $(shell pkg-config --libs openblas64)
+OBJS += ggml-blas.o
+endif # LLAMA_OPENBLAS64

+ifdef LLAMA_BLIS
+MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
+MK_LDFLAGS += -lblis -L/usr/local/lib
+OBJS += ggml-blas.o
+endif # LLAMA_BLIS

ifndef LLAMA_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJS += sgemm.o
endif

-ifdef LLAMA_BLIS
-MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
-MK_LDFLAGS += -lblis -L/usr/local/lib
-endif # LLAMA_BLIS

ifdef LLAMA_RPC
MK_CPPFLAGS += -DGGML_USE_RPC
OBJS += ggml-rpc.o
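All of these BLAS variants now define `GGML_USE_BLAS` and compile `ggml-blas.o`; only the linked library differs. A hedged build sketch, assuming a system OpenBLAS with pkg-config metadata installed (swap in `LLAMA_OPENBLAS64=1` or `LLAMA_BLIS=1` for the other paths):

```sh
make LLAMA_OPENBLAS=1 -j llama-cli
```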
@@ -460,7 +507,7 @@ ifdef LLAMA_CUDA
CUDA_PATH ?= /usr/local/cuda
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
+MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
OBJS += $(OBJS_CUDA_TEMP_INST)
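The added `-L$(CUDA_PATH)/lib64/stubs` points the linker at the toolkit's stub libraries, which typically lets `-lcuda` resolve on machines that have the CUDA toolkit but no driver installed (e.g. build containers). A hedged build sketch, assuming the toolkit sits at the default `/usr/local/cuda`:

```sh
make LLAMA_CUDA=1 -j llama-cli
```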
@@ -561,6 +608,10 @@ ifdef LLAMA_VULKAN_DEBUG
MK_CPPFLAGS += -DGGML_VULKAN_DEBUG
endif

+ifdef LLAMA_VULKAN_MEMORY_DEBUG
+MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
+endif

ifdef LLAMA_VULKAN_VALIDATE
MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
endif

@@ -740,6 +791,9 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@

+ggml-blas.o: ggml-blas.cpp ggml-blas.h
+$(CXX) $(CXXFLAGS) -c $< -o $@

unicode.o: unicode.cpp unicode.h
$(CXX) $(CXXFLAGS) -c $< -o $@

@@ -777,7 +831,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)

clean:
-rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+rm -vrf *.o tests/*.o *.so *.a *.dll common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf ggml-cuda/*.o
rm -vrf ggml-cuda/template-instances/*.o
find examples pocs -type f -name "*.o" -delete

@@ -793,62 +847,62 @@ clean:
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))

-main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+llama-cli: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo
-@echo '==== Run ./main -h for help. ===='
+@echo '==== Run ./llama-cli -h for help. ===='
@echo

-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+llama-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
+llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+llama-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -861,23 +915,27 @@ examples/server/%.hpp: examples/server/public/% Makefile
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
) > $@

-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
+llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
+llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -888,55 +946,61 @@ llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS)
libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual

-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)

-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+llama-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
+llama-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+llama-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
-$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
-$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
-
-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+llama-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-create: examples/lookup/lookup-create.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-merge: examples/lookup/lookup-merge.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-stats: examples/lookup/lookup-stats.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
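The lookup helper binaries that were previously produced as side effects of a single rule are now ordinary targets; a hedged sketch of building them explicitly (target names taken from the rules above):

```sh
make -j llama-lookup llama-lookup-create llama-lookup-merge llama-lookup-stats
```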
@@ -962,20 +1026,20 @@ build-info.o: common/build-info.cpp
tests: $(TEST_TARGETS)

-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
+llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-run-benchmark-matmult: benchmark-matmult
+run-benchmark-matmult: llama-benchmark-matmult
./$@

.PHONY: run-benchmark-matmult swift

-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
+llama-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
+llama-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1,6 +1,7 @@
# llama.cpp for SYCL

- [Background](#background)
+- [Recommended Release](#recommended-release)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)

@@ -31,8 +32,23 @@ When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneM
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

+## Recommended Release
+
+The SYCL backend would be broken by some PRs due to no online CI.
+
+The following release is verified with good quality:
+
+|Commit ID|Tag|Release|Verified Platform|
+|-|-|-|-|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
+
## News

+- 2024.5
+  - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
+  - Arch Linux is verified successfully.
+
- 2024.4
  - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.

@@ -77,7 +93,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
*Notes:*

- **Memory**
-  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
+  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.

  - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
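A hedged way to read that figure out of the log (the model path is a placeholder and the exact log wording may vary between builds):

```sh
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "hi" -n 8 2>&1 | grep "llm_load_tensors"
```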
@@ -99,14 +115,14 @@ The docker build option is currently limited to *intel GPU* targets.
### Build image
```sh
# Using FP16
-docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
+docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
```

*Notes*:

To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
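Concretely, the FP32 variant is the same command with the build argument dropped:

```sh
docker build -t llama-cpp-sycl -f .devops/llama-cli-intel.Dockerfile .
```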
-You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
+You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.

### Run container

@@ -275,7 +291,7 @@ source /opt/intel/oneapi/setvars.sh
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:

```sh
-./build/bin/ls-sycl-device
+./build/bin/llama-ls-sycl-device
```
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
```

@@ -313,7 +329,7 @@ Examples:
- Use device 0:

```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
or run by script:

@@ -324,7 +340,7 @@ or run by script:
- Use multiple devices:

```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```

Otherwise, you can run the script:

@@ -394,15 +410,9 @@ Output (example):
4. Install build tools

-a. Download & install cmake for Windows: https://cmake.org/download/
+a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
+b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)

-b. Download & install mingw-w64 make for Windows provided by w64devkit
-
-- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
-- Extract `w64devkit` on your pc.
-- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).

### II. Build llama.cpp

@@ -412,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
+cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release

# Option 2: Or FP16
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

cmake --build build --config Release -j
```

@@ -425,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
.\examples\sycl\win-build-sycl.bat
```

+Or, use CMake presets to build:
+```sh
+cmake --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake --preset x64-windows-sycl-debug
+cmake --build build-x64-windows-sycl-debug -j --target llama-cli
+```
+
+Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.

*Notes:*

-- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
+- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.

### III. Run the inference

@@ -488,13 +512,13 @@ Examples:
- Use device 0:

```
-build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
```

- Use multiple devices:

```
-build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
+build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
Otherwise, run the following wrapper script:

README.md (62 changed lines)

@@ -10,6 +10,9 @@
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

+> [!IMPORTANT]
+> [2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)

### Recent API changes

- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
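In practice only the executable name changes, not its flags; a hedged before/after sketch (the model path is a placeholder):

```sh
./main -m ./models/model.gguf -p "Hello"        # before the rename
./llama-cli -m ./models/model.gguf -p "Hello"   # after the rename
```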
@@ -192,6 +195,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
+- [RAGNA Desktop](https://ragna.app/) (proprietary)
- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)

@@ -205,6 +209,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
+- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)

*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

@@ -217,7 +222,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
Here is a typical run using LLaMA v2 13B on M2 Ultra:

```
-$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
+$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm

@@ -383,6 +388,30 @@ brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668

+### Nix
+
+On Mac and Linux, the Nix package manager can be used via
+```
+nix profile install nixpkgs#llama-cpp
+```
+For flake enabled installs.
+
+Or
+```
+nix-env --file '<nixpkgs>' --install --attr llama-cpp
+```
+For non-flake enabled installs.
+
+This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
+
+#### Flox
+
+On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
+```
+flox install llama-cpp
+```
+Flox follows the nixpkgs build of llama.cpp.

### Metal Build

On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.

@@ -555,7 +584,7 @@ Building the program with BLAS support may lead to some performance improvements
```sh
# Build the image
-docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
+docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .

# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33

@@ -586,7 +615,7 @@ Building the program with BLAS support may lead to some performance improvements
cmake -B build -DLLAMA_VULKAN=1
cmake --build build --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
-./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
+./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4

# You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32

@@ -619,21 +648,18 @@ python3 -m pip install -r requirements.txt
# convert the model to ggml FP16 format
python3 convert-hf-to-gguf.py models/mymodel/

-# [Optional] for models using BPE tokenizers
-python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
-
# quantize the model to 4-bits (using Q4_K_M method)
-./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
+./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M

# update the gguf filetype to current version if older version is now unsupported
-./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
+./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
```

### Run the quantized model

```bash
# start inference on a gguf model
-./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
+./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
```

When running the larger models, make sure you have enough disk space to store all the intermediate files.

@@ -708,7 +734,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread
#### How to run

1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
+2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
perplexity : calculating perplexity over 655 chunks

@@ -732,16 +758,16 @@ Here is an example of a few-shot interaction, invoked with the command
./examples/chat-13B.sh

# custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```

-Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
+Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.



### Persistent Interaction

-The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
+The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.

```bash
# Start a new chat

@@ -763,7 +789,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:

```bash
-./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
```

The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).

@@ -842,7 +868,7 @@ $mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/ho
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
-$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
+$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
```

Here's a demo of an interactive session running on Pixel 5 phone:

@@ -909,8 +935,8 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia

```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
-docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
-docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
+docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
+docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
```

You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.

@@ -960,7 +986,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m

### Docs

-- [main](./examples/main/README.md)
+- [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md)

ci/run.sh (224 changed lines)

@@ -303,47 +303,47 @@ function gg_run_open_llama_7b_v2 {

wiki_test="${path_wiki}/wiki.test.raw"

-./bin/quantize ${model_f16} ${model_q8_0} q8_0
-./bin/quantize ${model_f16} ${model_q4_0} q4_0
-./bin/quantize ${model_f16} ${model_q4_1} q4_1
-./bin/quantize ${model_f16} ${model_q5_0} q5_0
-./bin/quantize ${model_f16} ${model_q5_1} q5_1
-./bin/quantize ${model_f16} ${model_q2_k} q2_k
-./bin/quantize ${model_f16} ${model_q3_k} q3_k
-./bin/quantize ${model_f16} ${model_q4_k} q4_k
-./bin/quantize ${model_f16} ${model_q5_k} q5_k
-./bin/quantize ${model_f16} ${model_q6_k} q6_k
+./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
+./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
+./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
+./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
+./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
+./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
+./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
+./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
+./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
+./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
@@ -437,45 +437,45 @@ function gg_run_pythia_1_4b {

 wiki_test_60="${path_wiki}/wiki.test-60.raw"

-./bin/quantize ${model_f16} ${model_q8_0} q8_0
+./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-./bin/quantize ${model_f16} ${model_q4_0} q4_0
+./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-./bin/quantize ${model_f16} ${model_q4_1} q4_1
+./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-./bin/quantize ${model_f16} ${model_q5_0} q5_0
+./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-./bin/quantize ${model_f16} ${model_q5_1} q5_1
+./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-./bin/quantize ${model_f16} ${model_q2_k} q2_k
+./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-./bin/quantize ${model_f16} ${model_q3_k} q3_k
+./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-./bin/quantize ${model_f16} ${model_q4_k} q4_k
+./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-./bin/quantize ${model_f16} ${model_q5_k} q5_k
+./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-./bin/quantize ${model_f16} ${model_q6_k} q6_k
+./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

 function check_ppl {
 qnt="$1"
@@ -569,47 +569,47 @@ function gg_run_pythia_2_8b {

 wiki_test="${path_wiki}/wiki.test.raw"

-./bin/quantize ${model_f16} ${model_q8_0} q8_0
+./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-./bin/quantize ${model_f16} ${model_q4_0} q4_0
+./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-./bin/quantize ${model_f16} ${model_q4_1} q4_1
+./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-./bin/quantize ${model_f16} ${model_q5_0} q5_0
+./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-./bin/quantize ${model_f16} ${model_q5_1} q5_1
+./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-./bin/quantize ${model_f16} ${model_q2_k} q2_k
+./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-./bin/quantize ${model_f16} ${model_q3_k} q3_k
+./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-./bin/quantize ${model_f16} ${model_q4_k} q4_k
+./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-./bin/quantize ${model_f16} ${model_q5_k} q5_k
+./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-./bin/quantize ${model_f16} ${model_q6_k} q6_k
+./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

 function check_ppl {
 qnt="$1"
@@ -693,10 +693,10 @@ function gg_run_embd_bge_small {

 model_f16="${path_models}/ggml-model-f16.gguf"
 model_q8_0="${path_models}/ggml-model-q8_0.gguf"

-./bin/quantize ${model_f16} ${model_q8_0} q8_0
+./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

-(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

 set +e
 }
codecov.yml (14 changes, file deleted)
@@ -1,14 +0,0 @@
-comment: off
-
-coverage:
-status:
-project:
-default:
-target: auto
-threshold: 0
-base: auto
-patch:
-default:
-target: auto
-threshold: 0
-base: auto
@@ -1576,6 +1576,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 return true;
 }
 params.out_file = argv[i];
+params.cvector_outfile = argv[i];
 return true;
 }
 if (arg == "-ofreq" || arg == "--output-frequency") {
@@ -1610,6 +1611,55 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 params.i_chunk = std::stoi(argv[i]);
 return true;
 }
+// cvector params
+if (arg == "--completions-file") {
+if (++i >= argc) {
+invalid_param = true;
+return true;
+}
+params.cvector_completions_file = argv[i];
+return true;
+}
+if (arg == "--positive-file") {
+if (++i >= argc) {
+invalid_param = true;
+return true;
+}
+params.cvector_positive_file = argv[i];
+return true;
+}
+if (arg == "--negative-file") {
+if (++i >= argc) {
+invalid_param = true;
+return true;
+}
+params.cvector_negative_file = argv[i];
+return true;
+}
+if (arg == "--completions") {
+if (++i >= argc) {
+invalid_param = true;
+return true;
+}
+params.n_completions = std::stoi(argv[i]);
+return true;
+}
+if (arg == "--pca-batch") {
+if (++i >= argc) {
+invalid_param = true;
+return true;
+}
+params.n_pca_batch = std::stoi(argv[i]);
+return true;
+}
+if (arg == "--pca-iter") {
+if (++i >= argc) {
+invalid_param = true;
+return true;
+}
+params.n_pca_iterations = std::stoi(argv[i]);
+return true;
+}
 #ifndef LOG_DISABLE_LOGS
 // Parse args for logging parameters
 if (log_param_single_parse(argv[i])) {
@@ -1931,6 +1981,16 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
 #endif // LOG_DISABLE_LOGS

+options.push_back({ "cvector" });
+options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
+options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
+options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
+options.push_back({ "cvector", " --completions-file FNAME",
+"completions file (default: '%s')", params.cvector_completions_file.c_str() });
+options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
+options.push_back({ "cvector", " --batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
+options.push_back({ "cvector", " --iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });

 printf("usage: %s [options]\n", argv[0]);

 for (const auto & o : options) {
@@ -73,7 +73,6 @@ struct gpt_params {
 int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
 int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
 float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
-int32_t n_beams = 0; // if non-zero then use beam search of given width.
 int32_t grp_attn_n = 1; // group-attention factor
 int32_t grp_attn_w = 512; // group-attention width
 int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -232,6 +231,15 @@ struct gpt_params {

 bool process_output = false; // collect data for the output tensor
 bool compute_ppl = true; // whether to compute perplexity

+// cvector-generator params
+int n_completions = 64;
+int n_pca_batch = 20;
+int n_pca_iterations = 1000;
+std::string cvector_outfile = "control_vector.gguf";
+std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
+std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
 };

 void gpt_params_handle_model_default(gpt_params & params);
@@ -83,6 +83,7 @@ models = [
 {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
 {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
 {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
+{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
 {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
 ]

@@ -477,6 +477,9 @@ class Model:
 if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
 # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
 res = "smaug-bpe"
+if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
+# ref: https://huggingface.co/LumiOpen/Poro-34B-chat
+res = "poro-chat"
 if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
 # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
 res = "jina-v2-code"
@@ -1671,6 +1674,12 @@ class Qwen2MoeModel(Model):
 super().set_gguf_parameters()
 if (n_experts := self.hparams.get("num_experts")) is not None:
 self.gguf_writer.add_expert_count(n_experts)
+if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
+self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
+logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")

 _experts: list[dict[str, Tensor]] | None = None

@@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil

 When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.

-Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
+Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).

 ## GGUF specification

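For context on that renamed debugging tool, a minimal sketch of such a run is shown below. This is an assumed example, not an excerpt from the diff: the model path is a placeholder, and the flags shown are just the standard common options (`-m`, `-p`, `-ngl`) that the example is expected to accept.

```sh
# build the renamed example target, then dump per-operation tensor values while evaluating a short prompt
cmake --build build --config Release --target llama-eval-callback
./build/bin/llama-eval-callback -m ./models/model.gguf -p "hello" -ngl 99
```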
@@ -3,7 +3,7 @@
 ## Verifying that the model is running on the GPU with CUDA
 Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
-./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
+./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```

 When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -27,7 +27,7 @@ RAM: 32GB

 Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)

-Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`

 Result:

@@ -12,43 +12,45 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

 if (EMSCRIPTEN)
 else()
+add_subdirectory(cvector-generator)
 add_subdirectory(baby-llama)
-add_subdirectory(batched)
 add_subdirectory(batched-bench)
+add_subdirectory(batched)
 add_subdirectory(benchmark)
 add_subdirectory(convert-llama2c-to-ggml)
 add_subdirectory(embedding)
 add_subdirectory(eval-callback)
+add_subdirectory(export-lora)
 add_subdirectory(finetune)
-add_subdirectory(gritlm)
+add_subdirectory(gbnf-validator)
 add_subdirectory(gguf-split)
+add_subdirectory(gguf)
+add_subdirectory(gritlm)
+add_subdirectory(imatrix)
 add_subdirectory(infill)
 add_subdirectory(llama-bench)
 add_subdirectory(llava)
-if (LLAMA_SYCL)
-add_subdirectory(sycl)
-endif()
-add_subdirectory(main)
-add_subdirectory(tokenize)
-add_subdirectory(parallel)
-add_subdirectory(perplexity)
-add_subdirectory(quantize)
-add_subdirectory(quantize-stats)
-add_subdirectory(retrieval)
-add_subdirectory(save-load-state)
-add_subdirectory(simple)
-add_subdirectory(passkey)
-add_subdirectory(speculative)
 add_subdirectory(lookahead)
 add_subdirectory(lookup)
-add_subdirectory(gguf)
+add_subdirectory(main)
-add_subdirectory(train-text-from-scratch)
+add_subdirectory(parallel)
-add_subdirectory(imatrix)
+add_subdirectory(passkey)
-if (LLAMA_BUILD_SERVER)
+add_subdirectory(perplexity)
-add_subdirectory(server)
+add_subdirectory(quantize-stats)
-endif()
+add_subdirectory(quantize)
-add_subdirectory(export-lora)
+add_subdirectory(retrieval)
 if (LLAMA_RPC)
 add_subdirectory(rpc)
 endif()
+if (LLAMA_BUILD_SERVER)
+add_subdirectory(server)
+endif()
+if (LLAMA_SYCL)
+add_subdirectory(sycl)
+endif()
+add_subdirectory(save-load-state)
+add_subdirectory(simple)
+add_subdirectory(speculative)
+add_subdirectory(tokenize)
+add_subdirectory(train-text-from-scratch)
 endif()
@@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
 GEN_OPTIONS+=(--threads "$N_THREAD")
 fi

-./main "${GEN_OPTIONS[@]}" \
+./llama-cli "${GEN_OPTIONS[@]}" \
 --model "$MODEL" \
 --in-prefix " " \
 --in-suffix "${AI_NAME}:" \
@@ -1,4 +1,4 @@
-set(TARGET baby-llama)
+set(TARGET llama-baby-llama)
 add_executable(${TARGET} baby-llama.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -58,4 +58,4 @@ echo "$2
 model=$1

 # generate the most likely continuation until the string "===" is found
-./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
+./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
@@ -1,4 +1,4 @@
-set(TARGET batched-bench)
+set(TARGET llama-batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -10,16 +10,16 @@ There are 2 modes of operation:
 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

 ```bash
-./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
+./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]

 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
+./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99

 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
+./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps

 # custom set of batches
-./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
+./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
 ```

 ## Sample results

@@ -1,6 +1,6 @@
 .PHONY: build

 build:
-xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
+xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
-rm -f ./batched_swift
+rm -f ./llama-batched-swift
-ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
+ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift
@@ -4,7 +4,7 @@
 import PackageDescription

 let package = Package(
-name: "batched_swift",
+name: "llama-batched-swift",
 platforms: [.macOS(.v12)],
 dependencies: [
 .package(name: "llama", path: "../../"),
@@ -13,7 +13,7 @@ let package = Package(
 // Targets are the basic building blocks of a package, defining a module or a test suite.
 // Targets can depend on other targets in this package and products from dependencies.
 .executableTarget(
-name: "batched_swift",
+name: "llama-batched-swift",
 dependencies: ["llama"],
 path: "Sources",
 linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
@@ -1,4 +1,4 @@
 This is a swift clone of `examples/batched`.

 $ `make`
-$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
+$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`
@@ -1,4 +1,4 @@
-set(TARGET batched)
+set(TARGET llama-batched)
 add_executable(${TARGET} batched.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -3,7 +3,7 @@
 The example demonstrates batched generation from a given prompt

 ```bash
-./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
+./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4

 ...

@@ -1,4 +1,4 @@
-set(TARGET benchmark)
+set(TARGET llama-bench-matmult)
 add_executable(${TARGET} benchmark-matmult.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
 $PROMPT_TEMPLATE > $PROMPT_FILE

 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./main $GEN_OPTIONS \
+./llama-cli $GEN_OPTIONS \
 --model "$MODEL" \
 --threads "$N_THREAD" \
 --n_predict "$N_PREDICTS" \
@@ -62,7 +62,7 @@ fi
 if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
 echo 'Prompt cache does not exist, building...'
 # Default batch_size to 64 here for better user feedback during initial prompt processing
-./main 2>>"$LOG" \
+./llama-cli 2>>"$LOG" \
 --batch_size 64 \
 "${OPTS[@]}" \
 --prompt-cache "$PROMPT_CACHE_FILE" \
@@ -109,13 +109,13 @@ while read -e line; do

 printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"

-./main 2>>"$LOG" "${OPTS[@]}" \
+./llama-cli 2>>"$LOG" "${OPTS[@]}" \
 --prompt-cache "$CUR_PROMPT_CACHE" \
 --prompt-cache-all \
 --file "$CUR_PROMPT_FILE" \
 --reverse-prompt "${USER_NAME}:" \
 --n_predict "$n_predict" |
-skip_bytes 1 | # skip BOS token added by ./main
+skip_bytes 1 | # skip BOS token added by ./llama-cli
 tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
 skip_bytes "$n_prompt_len_pre" # print generation

@@ -133,7 +133,7 @@ while read -e line; do
 # TODO get both messages in one go
 if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
 ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
-echo >&2 "Couldn't get number of tokens from ./main output!"
+echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
 exit 1
 fi

@@ -144,7 +144,7 @@ while read -e line; do
 fi

 # Update cache for next prompt in background, ideally during user input
-./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
+./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
 --prompt-cache "$NEXT_PROMPT_CACHE" \
 --file "$NEXT_PROMPT_FILE" \
 --n_predict 1 &
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
 $PROMPT_TEMPLATE > $PROMPT_FILE

 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./bin/main $GEN_OPTIONS \
+./bin/llama-cli $GEN_OPTIONS \
 --model "$MODEL" \
 --threads "$N_THREAD" \
 --n_predict "$N_PREDICTS" \
@@ -11,6 +11,6 @@ cd ..
 #
 # "--keep 48" is based on the contents of prompts/chat-with-bob.txt
 #
-./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
+./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
 --repeat_penalty 1.0 --color -i \
 -r "User:" -f prompts/chat-with-bob.txt
@@ -1,4 +1,4 @@
-set(TARGET convert-llama2c-to-ggml)
+set(TARGET llama-convert-llama2c-to-ggml)
 add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -8,7 +8,7 @@ To convert the model first download the models from the [llama2.c](https://githu

 After successful compilation, following usage options are available:
 ```
-usage: ./convert-llama2c-to-ggml [options]
+usage: ./llama-convert-llama2c-to-ggml [options]

 options:
 -h, --help show this help message and exit
@@ -19,10 +19,10 @@ options:

 An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:

-`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
+`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`

 Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).

 Now you can use the model with a command like:

-`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
+`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
examples/cvector-generator/CMakeLists.txt (new file, 5 additions)
@@ -0,0 +1,5 @@
+set(TARGET llama-cvector-generator)
+add_executable(${TARGET} cvector-generator.cpp pca.hpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/cvector-generator/README.md (new file, 34 additions)
@@ -0,0 +1,34 @@
+# cvector-generator
+
+This example demonstrates how to generate a control vector using gguf models.
+
+Related PRs:
+- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
+- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
+- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
+
+## Examples
+
+```sh
+# CPU only
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
+
+# With GPU
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
+
+# With advanced options
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100
+
+# To see help message
+./cvector-generator -h
+# Then, have a look at "cvector" section
+```
+
+## Tips and tricks
+
+If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example:
+
+```
+<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
+<|im_start|>system\nYou are in a very good mood today<|im_end|>
+```
examples/cvector-generator/completions.txt (new file, 582 additions; truncated here)
@@ -0,0 +1,582 @@
+
+That game
+I can see
+Hmm, this
+I can relate to
+Who is
+I understand the
+Ugh,
+What the hell was
+Hey, did anyone
+Although
+Thank you for choosing
+What are you
+Oh w
+How dare you open
+It was my pleasure
+I'm hon
+I appreciate that you
+Are you k
+Whoever left this
+It's always
+Ew,
+Hey, I l
+Hello? Is someone
+I understand that
+That poem
+Aww, poor
+Hey, it
+Alright, who
+I didn't
+Well, life
+The document
+Oh no, this
+I'm concerned
+Hello, this is
+This art
+Hmm, this drink
+Hi there!
+It seems
+Is
+Good
+I can't
+Ex
+Who are
+I can see that
+Wow,
+Today is a
+Hey friend
+Sometimes friends
+Oh, this old
+The weather outside
+This place is sur
+I appreciate your input
+Thank you for the
+Look at
+I'm disappoint
+To my
+How dare you
+That's an
+This piece of art
+Eww
+This park is
+This is incredible
+Oh no, someone
+Exc
+Well, it'
+I warned
+Hey, I understand
+Hey, I saw
+How dare you go
+What the he
+Hey
+It's
+Hello? Hello?
+It
+Oh no!
+This is the perfect
+Good morning,
+Oh no, there
+It's so
+Yeah
+Uh,
+Hello everyone
+Who turned off
+The weather
+Who'
+Hey, this
+Wait,
+Eww, gross
+Excuse
+It seems like you
+Thank you so
+What happened?
+Oh my g
+I am deeply sad
+I war
+Okay, let'
+Hey, that
+That was a beautiful
+Oh no! That
+What happened
+Hey there
+The artist'
+What?!
+Hey, it'
+I am disappoint
+It seems like
+Oh no! The
+This park is a
+If you
+Yes! I did
+It sounds
+What
+Who is it
+Hmm, that
+That's strange
+Yeah, that was
+That's interesting
+This park
+What the hell
+Who is that
+I feel like my
+Oh well
+What the hell is
+Hello? Hello
+To my dearest
+Bless you!\"
+Thank you for
+Oh, looks like
+Can you please
+This place is
+Eww, what
+Bless you
+Is everything
+Hey, I just
+Whoever left these
+Well, that'
+I feel
+Hey, do you
+It's sad
+Oh no, it
+Hey, that'
+Oh my god,
+Thank you,
+Hello little one,
+I apolog
+Hey team, I
+How dare you read
+Who is this and
+Whoever left
+Hi there! W
+A
+If you have
+I was
+U
+Bless
+Well, this
+Oh, I'
+It's a
+Eww,
+Is everything okay?
+Oh, I
+Hello, can you
+Al
+That was a great
+What are
+I understand that not
+Oh no, not
+Who is it?\"
+Hey, can we
+Whoever is taking
+I would love to
+Hey, I noticed
+Hey, could
+I understand that there
+Hello?
+D
+Oh man, I
+Thank you so much
+Oh no, my
+Dear [Name
+Uh
+I remember
+Hey, who
+Well, it
+Are you
+I understand that it
+Hey, is
+I would
+Who is this
+Excuse me
+Alright
+I am thrilled
+Sometimes friends have
+Who the
+It's interesting
+I would love
+E
+Hello? Is anyone
+Well, this is
+This place
+Well,
+I warned you
+Hey, watch where
+Oh my
+That'
+Sometimes friends have different
+I understand that everyone
+What?
+What do these notes
+I can relate
+I'm not
+I understand
+To my dear
+Guys
+Well
+Hey, I appreciate
+Wow, what
+Dear
+That melody
+Who the hell
+Today is
+Hello little
+Wow, look
+That's great
+Love is never wrong
+I'm having
+Whoa, did
+Ugh
+Can you please provide
+I miss you,
+I feel uncom
+I know
+Ugh, this
+Hey, watch
+Oh great, a
+I didn
+Okay
+That game of char
+Oh
+I appreciate
+Who's there
+I am so
+Oh great, someone
+Hey, could you
+I remember wondering
+Wait, what?
+What do
+Hello? Can
+Hey there,
+That game of
+This is incred
+Oh my gosh
+Oh great, f
+I appreciate your
It sounds like
|
||||||
|
What the heck
|
||||||
|
Okay, I understand
|
||||||
|
Ew
|
||||||
|
I understand that this
|
||||||
|
Uh, hi
|
||||||
|
Hi everyone!
|
||||||
|
What the hell?
|
||||||
|
Thank you for your
|
||||||
|
Oh no, the
|
||||||
|
Wow, I
|
||||||
|
Who turned
|
||||||
|
Dear [
|
||||||
|
Whoever
|
||||||
|
This is a
|
||||||
|
Whoa, he
|
||||||
|
What in the world
|
||||||
|
Although the physical
|
||||||
|
Hello, who is
|
||||||
|
That's amaz
|
||||||
|
Hey, I know
|
||||||
|
Okay, that
|
||||||
|
Hi everyone
|
||||||
|
Hey, is everything
|
||||||
|
I understand your fr
|
||||||
|
Oh no, poor
|
||||||
|
Oh, look
|
||||||
|
Good morning
|
||||||
|
Ew, gross
|
||||||
|
Oh no, did
|
||||||
|
Look at the family
|
||||||
|
Hey team
|
||||||
|
Yes!
|
||||||
|
Hey, can I
|
||||||
|
Okay, that'
|
||||||
|
It's great
|
||||||
|
Love is
|
||||||
|
Hey, what
|
||||||
|
Good morning, world
|
||||||
|
Who is it?
|
||||||
|
That poem really reson
|
||||||
|
I
|
||||||
|
That's
|
||||||
|
I understand the task
|
||||||
|
Gu
|
||||||
|
Hello? Who'
|
||||||
|
This postcard is
|
||||||
|
Whoa,
|
||||||
|
Oh, that
|
||||||
|
I understand that I
|
||||||
|
Whoever is
|
||||||
|
Hello? Who is
|
||||||
|
I'm really
|
||||||
|
Wow, this
|
||||||
|
Can
|
||||||
|
This artwork really
|
||||||
|
This is a shame
|
||||||
|
I miss you too
|
||||||
|
Who are you?
|
||||||
|
Today is a difficult
|
||||||
|
Hey, just
|
||||||
|
Are you okay
|
||||||
|
I am
|
||||||
|
Hi,
|
||||||
|
Wow, that
|
||||||
|
Hey there! Can
|
||||||
|
Okay, stay
|
||||||
|
Oh great, just
|
||||||
|
Yeah,
|
||||||
|
Hello? Can you
|
||||||
|
Oh, looks
|
||||||
|
Thank you for sharing
|
||||||
|
I'm glad
|
||||||
|
Hey, is that
|
||||||
|
Hmm
|
||||||
|
It was my
|
||||||
|
It sounds like you
|
||||||
|
Wow, your
|
||||||
|
I was promised certain
|
||||||
|
That was such a
|
||||||
|
Thank
|
||||||
|
Excuse you
|
||||||
|
That was
|
||||||
|
Hey team,
|
||||||
|
I feel un
|
||||||
|
It was
|
||||||
|
What'
|
||||||
|
Hey friend, I
|
||||||
|
How
|
||||||
|
Saying goodbye
|
||||||
|
That
|
||||||
|
It's heart
|
||||||
|
How dare
|
||||||
|
Oh,
|
||||||
|
Hello, may
|
||||||
|
What's this
|
||||||
|
Thank you for recogn
|
||||||
|
Aww, that
|
||||||
|
Oh, I remember
|
||||||
|
Hmm, that'
|
||||||
|
I miss
|
||||||
|
I know this
|
||||||
|
Wait
|
||||||
|
Is everything okay
|
||||||
|
Who is that person
|
||||||
|
Wow, you
|
||||||
|
Oh great
|
||||||
|
I'm sad
|
||||||
|
Wow, the
|
||||||
|
I am very disappoint
|
||||||
|
Who turned off the
|
||||||
|
I understand that things
|
||||||
|
I'm very
|
||||||
|
Hi
|
||||||
|
That's very
|
||||||
|
Okay, I
|
||||||
|
Oh no,
|
||||||
|
Wow, there
|
||||||
|
What's wrong
|
||||||
|
I apologize for
|
||||||
|
Hey, I
|
||||||
|
Can I help you
|
||||||
|
Oh, I didn
|
||||||
|
Alright,
|
||||||
|
Oh wow,
|
||||||
|
Oh my goodness
|
||||||
|
I know this event
|
||||||
|
What in the
|
||||||
|
Saying
|
||||||
|
Yeah, that
|
||||||
|
Guys, I
|
||||||
|
Hey, this v
|
||||||
|
This post
|
||||||
|
Are
|
||||||
|
Hey, can
|
||||||
|
Hello? Is
|
||||||
|
I can only imagine
|
||||||
|
Oh, that sounds
|
||||||
|
Hey, is anyone
|
||||||
|
I am disappointed
|
||||||
|
Hello,
|
||||||
|
Hey everyone, I
|
||||||
|
That was such
|
||||||
|
It's okay
|
||||||
|
The artist
|
||||||
|
Whoa
|
||||||
|
I understand that mistakes
|
||||||
|
Can I help
|
||||||
|
Who
|
||||||
|
Hi everyone! I
|
||||||
|
Hey, can you
|
||||||
|
Wow, how
|
||||||
|
Today
|
||||||
|
Oh no, I
|
||||||
|
Oh well, I
|
||||||
|
Well, that
|
||||||
|
This is the
|
||||||
|
Yes! I finally
|
||||||
|
Hey there little
|
||||||
|
Hello everyone!
|
||||||
|
Love is never
|
||||||
|
Look at the
|
||||||
|
This postcard
|
||||||
|
Oh great,
|
||||||
|
Can I
|
||||||
|
Hmm, this is
|
||||||
|
I understand your
|
||||||
|
Oh, look at
|
||||||
|
B
|
||||||
|
I'm so
|
||||||
|
Whoa, this
|
||||||
|
W
|
||||||
|
Oh, this
|
||||||
|
Sometimes
|
||||||
|
This piece of
|
||||||
|
What the
|
||||||
|
That was a
|
||||||
|
Hey, do
|
||||||
|
Oh no
|
||||||
|
Whoa, what
|
||||||
|
I feel like I
|
||||||
|
The documentary
|
||||||
|
Hello
|
||||||
|
Hello little one
|
||||||
|
I understand that my
|
||||||
|
Eww, that
|
||||||
|
Wow, an
|
||||||
|
Yes! Finally,
|
||||||
|
Although the physical location
|
||||||
|
Whoever is watching
|
||||||
|
That movie
|
||||||
|
I remember wondering about
|
||||||
|
Hey there, little
|
||||||
|
Who's
|
||||||
|
Hello, who
|
||||||
|
Hello everyone! Thank
|
||||||
|
Hello, can
|
||||||
|
That's too
|
||||||
|
Hey, just wanted
|
||||||
|
Hey there, I
|
||||||
|
Saying good
|
||||||
|
Hey there!
|
||||||
|
Who is there?
|
||||||
|
Oh my good
|
||||||
|
I am very
|
||||||
|
Oh no, what
|
||||||
|
Wow, thank
|
||||||
|
I was promised
|
||||||
|
Hi, is
|
||||||
|
Hey, I'
|
||||||
|
Guys, the
|
||||||
|
Oh no, that
|
||||||
|
Who is there
|
||||||
|
Hello, this
|
||||||
|
That movie really touched
|
||||||
|
If you have something
|
||||||
|
The documentary was
|
||||||
|
I'm starting
|
||||||
|
Are you kidd
|
||||||
|
That movie really
|
||||||
|
Hey everyone,
|
||||||
|
Thank you for considering
|
||||||
|
I didn'
|
||||||
|
Yes! I
|
||||||
|
Can you
|
||||||
|
Oh my god
|
||||||
|
Hey, whoever
|
||||||
|
That melody really
|
||||||
|
Thank you, little
|
||||||
|
Hello, may I
|
||||||
|
Look
|
||||||
|
Wow, we
|
||||||
|
It looks
|
||||||
|
What do these
|
||||||
|
Oh wow
|
||||||
|
I apologize
|
||||||
|
What are you all
|
||||||
|
It's such
|
||||||
|
It's clear
|
||||||
|
Hey, I was
|
||||||
|
Hey friend,
|
||||||
|
I can only
|
||||||
|
The weather outside is
|
||||||
|
Eww, this
|
||||||
|
I miss you
|
||||||
|
Wow
|
||||||
|
Aww,
|
||||||
|
Hi, is there
|
||||||
|
This artwork
|
||||||
|
Okay,
|
||||||
|
Oh well,
|
||||||
|
This
|
||||||
|
I'
|
||||||
|
Say
|
||||||
|
Hey there little gu
|
||||||
|
Hmm,
|
||||||
|
Whoa, who
|
||||||
|
I am thr
|
||||||
|
Oh man
|
||||||
|
Okay, stay calm
|
||||||
|
I'm happy
|
||||||
|
Oh, this cur
|
||||||
|
Oh man,
|
||||||
|
I'm sorry
|
||||||
|
Hello? Who
|
||||||
|
What?! That
|
||||||
|
This piece
|
||||||
|
Hey everyone
|
||||||
|
That's so
|
||||||
|
Are you okay?
|
||||||
|
What happened? Where
|
||||||
|
Hi there
|
||||||
|
The
|
||||||
|
Who the hell entered
|
||||||
|
I can
|
||||||
|
Guys,
|
||||||
|
What's
|
||||||
|
What in
|
||||||
|
It's important
|
||||||
|
I'm
|
||||||
|
I'm coming
|
||||||
|
It'
|
||||||
|
Yes! Finally
|
||||||
|
Wait, what
|
||||||
|
Wow, reading
|
||||||
|
I'm surprised
|
||||||
|
Hey, did
|
||||||
|
Hey,
|
||||||
|
Okay, let
|
||||||
|
I understand that you
|
||||||
|
Who the hell threw
|
||||||
|
Eww, who
|
||||||
|
Thank you for thinking
|
||||||
|
Who is this?\"
|
||||||
|
I am deeply
|
||||||
|
Thank you for including
|
||||||
|
Oh no, an
|
||||||
|
It looks like you
|
||||||
|
Aww
|
||||||
|
I'm confused
|
||||||
|
Wow, it
|
||||||
|
That poem really
|
||||||
|
Yes
|
||||||
|
Hey there, is
|
||||||
|
Hey, what'
|
||||||
|
Thank you for remember
|
||||||
|
To
|
||||||
|
This is
|
||||||
|
Thank you for making
|
||||||
|
I can'
|
||||||
|
That mel
|
||||||
|
Wow, they
|
||||||
|
I feel like
|
||||||
|
Although the
|
||||||
|
Who are you
|
||||||
|
Love
|
||||||
|
If
|
||||||
|
What the hell are
|
||||||
|
I am so sad
|
||||||
|
Oh, I found
|
||||||
|
Thank you
|
||||||
|
It looks like
|
||||||
|
Well, life is
|
||||||
|
I appreciate that
|
||||||
|
The artist's
|
||||||
|
Whoa, that
|
||||||
|
It's never
|
499
examples/cvector-generator/cvector-generator.cpp
Normal file
|
@ -0,0 +1,499 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "pca.hpp"
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include "ggml-metal.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <climits>
#include <cstdlib>
#include <cstring>
#include <sstream>
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// utils
|
||||||
|
|
||||||
|
template <class Iter>
|
||||||
|
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
|
||||||
|
std::string ret;
|
||||||
|
for (; begin != end; ++begin) {
|
||||||
|
ret += llama_token_to_piece(ctx, *begin);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||||
|
gpt_params_print_usage(argc, argv, params);
|
||||||
|
|
||||||
|
printf("\nexample usage:\n");
|
||||||
|
printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
|
||||||
|
printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
|
||||||
|
printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
|
// cb_eval is reused for each positive/negative prompt pair
|
||||||
|
struct callback_data {
|
||||||
|
ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered
|
||||||
|
|
||||||
|
int n_layers = 0;
|
||||||
|
int n_tokens = 0;
|
||||||
|
bool is_eval_pos = true;
|
||||||
|
|
||||||
|
// each element of the vector corresponds to one layer
|
||||||
|
std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
|
||||||
|
std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
|
||||||
|
std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows may be different for each layer
|
||||||
|
|
||||||
|
// save a tensor into either v_pos or v_neg (decided by is_eval_pos)
|
||||||
|
void save_tensor_for_layer(struct ggml_tensor * t) {
|
||||||
|
GGML_ASSERT(t->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
if (ctx_ggml == nullptr) {
|
||||||
|
// alloc a new ctx_ggml if needed
|
||||||
|
struct ggml_init_params params_ggml = {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u,
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ctx_ggml = ggml_init(params_ggml);
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy tensor data
|
||||||
|
auto n_bytes = ggml_nbytes(t);
|
||||||
|
struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
|
||||||
|
t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
|
||||||
|
ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
|
||||||
|
ggml_set_name(t_layer, ggml_get_name(t));
|
||||||
|
//print_debug_tensor(t_layer);
|
||||||
|
|
||||||
|
if (is_eval_pos) {
|
||||||
|
v_pos.push_back(t_layer);
|
||||||
|
} else {
|
||||||
|
v_neg.push_back(t_layer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// calculate diff (v_pos - v_neg) and place the result back to v_pos
|
||||||
|
// all zero rows in the diff tensor will also be removed
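// (a row of the diff is all-zero exactly when the positive and negative activations
//  for that token position coincide, so it carries no directional information)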
|
||||||
|
// NOTE: the final layer is ignored. we only have (n_layers - 1) layers to process
|
||||||
|
std::vector<struct ggml_tensor *> calc_diff() {
|
||||||
|
for (size_t il = 0; il < v_pos.size(); il++) {
|
||||||
|
float * a = (float *) v_pos[il]->data;
|
||||||
|
float * b = (float *) v_neg[il]->data;
|
||||||
|
size_t n_elem = ggml_nelements(v_pos[il]);
|
||||||
|
for (size_t j = 0; j < n_elem; j++) {
|
||||||
|
a[j] -= b[j];
|
||||||
|
}
|
||||||
|
//print_debug_tensor(v_pos[i]);
|
||||||
|
auto diff_filtered = filter_nonzero_rows(v_pos[il]);
|
||||||
|
v_diff_filtered.push_back(diff_filtered);
|
||||||
|
}
|
||||||
|
return v_diff_filtered; // for convenience, return the result as a std::vector
|
||||||
|
}
|
||||||
|
|
||||||
|
// delete zero rows from a given 2D tensor
|
||||||
|
struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
|
||||||
|
//printf("filter_nonzero_rows\n");
|
||||||
|
auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
|
||||||
|
// check if the given row contains only zero elements
|
||||||
|
int n_cols = t->ne[0]; // hint: should be equal to n_embd
|
||||||
|
for (int col = 0; col < n_cols; ++col) {
|
||||||
|
if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
std::vector<int> rows_to_copy; // indices of the non-zero rows (to be copied into rows of diff_filtered)
|
||||||
|
for (int i_row = 0; i_row < a->ne[1]; i_row++) {
|
||||||
|
if (!is_row_all_zeros(a, i_row, 1e-6)) {
|
||||||
|
rows_to_copy.push_back(i_row);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// get "n_nonzero_rows" for the output "diff_filtered"
|
||||||
|
int n_nonzero_rows = rows_to_copy.size();
|
||||||
|
//printf("n_nonzero_rows: %d\n", n_nonzero_rows);
|
||||||
|
int n_embd = a->ne[0];
|
||||||
|
GGML_ASSERT(n_nonzero_rows > 0);
|
||||||
|
|
||||||
|
// diff_filtered: [n_embd, n_nonzero_rows]
|
||||||
|
struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
|
||||||
|
ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
|
||||||
|
ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
|
||||||
|
diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
|
||||||
|
|
||||||
|
// copy non-zero rows
|
||||||
|
for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
|
||||||
|
int src_row = rows_to_copy[dest_row];
|
||||||
|
for (int i = 0; i < n_embd; i++) {
|
||||||
|
float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
|
||||||
|
ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//print_debug_tensor(diff_filtered);
|
||||||
|
|
||||||
|
return diff_filtered;
|
||||||
|
}
|
||||||
|
|
||||||
|
// we don't implement a destructor because we want to reuse callback_data; we just free the tensors
|
||||||
|
void reset() {
|
||||||
|
for (auto ptr : v_pos) free(ptr->data);
|
||||||
|
for (auto ptr : v_neg) free(ptr->data);
|
||||||
|
for (auto ptr : v_diff_filtered) free(ptr->data);
|
||||||
|
v_pos.clear();
|
||||||
|
v_neg.clear();
|
||||||
|
v_diff_filtered.clear();
|
||||||
|
if (ctx_ggml) {
|
||||||
|
ggml_free(ctx_ggml);
|
||||||
|
}
|
||||||
|
ctx_ggml = nullptr;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* process_ctx is used to store the ggml context for pre-post processing the diff vectors
|
||||||
|
* in short, input => v_diff and output => v_final
|
||||||
|
*/
|
||||||
|
struct train_context {
|
||||||
|
ggml_context * ctx_ggml;
|
||||||
|
int n_embd;
|
||||||
|
int n_layers;
|
||||||
|
|
||||||
|
/* pair of prompts to be used for generating final vector */
|
||||||
|
std::vector<std::string> positive_entries;
|
||||||
|
std::vector<std::string> negative_entries;
|
||||||
|
|
||||||
|
// each element of the vector corresponds to one layer
|
||||||
|
// NOTE: the last layer is discarded. therefore, we will have (n_layers - 1) elements here
|
||||||
|
// NOTE (2): v_diff is transposed from v_diff_tmp
|
||||||
|
std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
|
||||||
|
std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
|
||||||
|
|
||||||
|
// to easily re-alloc when concatenating v_diff, we temporarily store v_diff in a vector instead of a tensor
|
||||||
|
// v_diff_tmp will get converted into v_diff later on
|
||||||
|
std::vector<std::vector<uint8_t>> v_diff_tmp;
|
||||||
|
|
||||||
|
train_context(int n_embd_, int n_layers_) {
|
||||||
|
n_embd = n_embd_;
|
||||||
|
n_layers = n_layers_;
|
||||||
|
struct ggml_init_params params_ggml = {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ctx_ggml = ggml_init(params_ggml);
|
||||||
|
for (int il = 0; il < n_layers - 1; il++) {
|
||||||
|
std::vector<uint8_t> empty;
|
||||||
|
v_diff_tmp.push_back(empty);
|
||||||
|
auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
|
||||||
|
t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
|
||||||
|
v_final.push_back(t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// add new rows into existing tensor in v_diff_tmp
|
||||||
|
void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
|
||||||
|
GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
|
||||||
|
for (int il = 0; il < n_layers - 1; il++) {
|
||||||
|
auto t = diff_filtered[il];
|
||||||
|
auto & diff_tmp = v_diff_tmp[il];
|
||||||
|
size_t curr_size = diff_tmp.size();
|
||||||
|
diff_tmp.resize(curr_size + ggml_nbytes(t));
|
||||||
|
memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// build the v_diff tensors from v_diff_tmp (v_diff needs to be transposed)
|
||||||
|
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
|
||||||
|
void build_v_diff() {
|
||||||
|
printf("build_v_diff\n");
|
||||||
|
for (int il = 0; il < n_layers - 1; il++) {
|
||||||
|
auto & diff_tmp = v_diff_tmp[il];
|
||||||
|
int n_elem = diff_tmp.size() / sizeof(float);
|
||||||
|
GGML_ASSERT(n_elem % n_embd == 0);
|
||||||
|
int n_rows = n_elem / n_embd;
|
||||||
|
struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
|
||||||
|
ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
|
||||||
|
// copy data & transpose
|
||||||
|
diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
|
||||||
|
float * arr = (float *) diff_tmp.data();
|
||||||
|
for (int ir = 0; ir < n_rows; ++ir) {
|
||||||
|
for (int ic = 0; ic < n_embd; ++ic) {
|
||||||
|
float f = arr[ir*n_embd + ic];
|
||||||
|
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
v_diff.push_back(diff);
|
||||||
|
print_debug_tensor(diff);
|
||||||
|
// free memory of diff_tmp
|
||||||
|
diff_tmp.resize(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
~train_context() {
|
||||||
|
for (auto ptr : v_final) free(ptr->data);
|
||||||
|
for (auto ptr : v_diff) free(ptr->data);
|
||||||
|
// no need to free v_diff_tmp, since we didn't use malloc
|
||||||
|
ggml_free(ctx_ggml);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
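// NOTE: the positive and negative prompts are padded to the same length so that their
// hidden states can be compared position-by-position when computing the diff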
struct tokenized_prompt {
|
||||||
|
std::vector<llama_token> tokens_pos;
|
||||||
|
std::vector<llama_token> tokens_neg;
|
||||||
|
size_t max_seq_len;
|
||||||
|
|
||||||
|
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
|
||||||
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
|
||||||
|
tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
|
||||||
|
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
|
||||||
|
padding_seq(ctx, tokens_pos, max_seq_len);
|
||||||
|
padding_seq(ctx, tokens_neg, max_seq_len);
|
||||||
|
}
|
||||||
|
|
||||||
|
void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
|
||||||
|
// TODO: customize padding token
|
||||||
|
std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
|
||||||
|
llama_token pad_tok = pad_tokens.back();
|
||||||
|
while (tokens.size() < len) {
|
||||||
|
tokens.push_back(pad_tok);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
static std::string to_string(const T & val) {
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << val;
|
||||||
|
return ss.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
|
||||||
|
std::vector<std::string> output;
|
||||||
|
std::ifstream file(path);
|
||||||
|
if (!file.is_open()) {
|
||||||
|
fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
std::string line;
|
||||||
|
while (std::getline(file, line)) {
|
||||||
|
bool is_skip = skip_empty_lines && line.empty();
|
||||||
|
if (!is_skip) {
|
||||||
|
string_process_escapes(line);
|
||||||
|
output.push_back(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
file.close();
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
|
||||||
|
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
|
auto * cb_data = (callback_data *) user_data;
|
||||||
|
static const char * l_out_name = "l_out";
|
||||||
|
const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
|
||||||
|
|
||||||
|
if (ask) {
|
||||||
|
return is_l_out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// save the tensor to current context
|
||||||
|
cb_data->save_tensor_for_layer(t);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
|
||||||
|
llama_kv_cache_clear(ctx);
|
||||||
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
|
||||||
|
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
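// the exported gguf stores: general.architecture = "controlvector",
// controlvector.model_hint = <base model arch>, controlvector.layer_count,
// and one f32 tensor "direction.<layer>" of size [n_embd] per layer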
static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
|
||||||
|
struct gguf_context * ctx = gguf_init_empty();
|
||||||
|
|
||||||
|
const std::string arch = "controlvector";
|
||||||
|
gguf_set_val_str(ctx, "general.architecture", arch.c_str());
|
||||||
|
gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
|
||||||
|
gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
|
||||||
|
|
||||||
|
for (size_t i = 0; i < v_ctrl.size(); ++i) {
|
||||||
|
gguf_add_tensor(ctx, v_ctrl[i]);
|
||||||
|
print_debug_tensor(v_ctrl[i]);
|
||||||
|
printf("Added tensor: %s\n", v_ctrl[i]->name);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: writing file...\n", __func__);
|
||||||
|
gguf_write_to_file(ctx, fname.c_str(), false);
|
||||||
|
printf("%s: wrote file '%s'\n", __func__, fname.c_str());
|
||||||
|
gguf_free(ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load prompt files and completion file.
|
||||||
|
* Then format each pair of prompt + completion to make an entry.
|
||||||
|
*/
|
||||||
|
static int prepare_entries(gpt_params & params, train_context & ctx_train) {
|
||||||
|
// load prompts
|
||||||
|
std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
|
||||||
|
std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
|
||||||
|
if (positive_prompts.size() != negative_prompts.size()) {
|
||||||
|
fprintf(stderr, "number of positive and negative prompts must be equal\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (positive_prompts.empty()) {
|
||||||
|
fprintf(stderr, "must provide at least one prompt pair\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// create templated prompts
|
||||||
|
std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
|
||||||
|
auto format_template = [](std::string persona, std::string suffix) {
|
||||||
|
// entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
|
||||||
|
return persona + " " + suffix;
|
||||||
|
};
|
||||||
|
for (size_t i = 0; i < positive_prompts.size(); ++i) {
|
||||||
|
for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
|
||||||
|
// TODO replicate the truncations done by the python implementation
|
||||||
|
ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
|
||||||
|
ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
gpt_params params;
|
||||||
|
|
||||||
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
print_usage(argc, argv, params);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.n_pca_iterations % params.n_pca_batch != 0) {
|
||||||
|
fprintf(stderr, "PCA iterations must be a multiple of PCA batch size\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
callback_data cb_data;
|
||||||
|
|
||||||
|
// pass the callback to the backend scheduler
|
||||||
|
// it will be executed for each node during the graph computation
|
||||||
|
params.cb_eval = cb_eval;
|
||||||
|
params.cb_eval_user_data = &cb_data;
|
||||||
|
params.warmup = false;
|
||||||
|
|
||||||
|
print_build_info();
|
||||||
|
llama_backend_init();
|
||||||
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
|
// load the model to get hparams
|
||||||
|
llama_model * model;
|
||||||
|
llama_context * ctx;
|
||||||
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
|
// int n_ctx = llama_n_ctx(ctx);
|
||||||
|
int n_layers = llama_n_layer(model);
|
||||||
|
int n_embd = llama_n_embd(model);
|
||||||
|
// get model hint param (a.k.a model arch name)
|
||||||
|
char model_hint[128];
|
||||||
|
llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
|
||||||
|
|
||||||
|
// init train_context
|
||||||
|
train_context ctx_train(n_embd, n_layers);
|
||||||
|
|
||||||
|
// load and prepare entries for training
|
||||||
|
prepare_entries(params, ctx_train);
|
||||||
|
|
||||||
|
// we have to pretokenize everything because otherwise we don't know how much overhead to allocate for ctx_diffs_wrapped
|
||||||
|
std::vector<tokenized_prompt> tokenized_prompts;
|
||||||
|
size_t n_total_tokens = 0;
|
||||||
|
for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
|
||||||
|
tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
|
||||||
|
n_total_tokens += 2 * t.max_seq_len;
|
||||||
|
tokenized_prompts.push_back(std::move(t));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
|
||||||
|
|
||||||
|
for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
|
||||||
|
bool success = false;
|
||||||
|
tokenized_prompt t = tokenized_prompts[i];
|
||||||
|
cb_data.n_layers = n_layers;
|
||||||
|
cb_data.n_tokens = t.max_seq_len;
|
||||||
|
|
||||||
|
printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
|
||||||
|
(int) i+1, (int) ctx_train.positive_entries.size(),
|
||||||
|
tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
|
||||||
|
tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
|
||||||
|
(int) t.max_seq_len);
|
||||||
|
|
||||||
|
cb_data.is_eval_pos = true;
|
||||||
|
success = get_hidden_layers(ctx, t.tokens_pos);
|
||||||
|
if (!success) break;
|
||||||
|
|
||||||
|
cb_data.is_eval_pos = false;
|
||||||
|
success = get_hidden_layers(ctx, t.tokens_neg);
|
||||||
|
if (!success) break;
|
||||||
|
|
||||||
|
// calculate diff and remove all zero rows
|
||||||
|
auto v_diff_filtered = cb_data.calc_diff();
|
||||||
|
|
||||||
|
// save & concat the filtered v_diff to ctx_train
|
||||||
|
ctx_train.concat_diff_tmp(v_diff_filtered);
|
||||||
|
|
||||||
|
// reset for next iteration
|
||||||
|
cb_data.reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
// done with the model, we can now free it to reclaim some memory
|
||||||
|
printf("Done evaluating prompts, unloading model...\n");
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
|
// prepare ctx_train for PCA
|
||||||
|
ctx_train.build_v_diff();
|
||||||
|
|
||||||
|
// run PCA
|
||||||
|
PCA::pca_params pca_params;
|
||||||
|
pca_params.n_threads = params.n_threads;
|
||||||
|
pca_params.n_batch = params.n_pca_batch;
|
||||||
|
pca_params.n_iterations = params.n_pca_iterations;
|
||||||
|
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
||||||
|
|
||||||
|
// write output vectors to gguf
|
||||||
|
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
1
examples/cvector-generator/negative.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
[INST] Act like a person who is extremely sad. [/INST]
|
322
examples/cvector-generator/pca.hpp
Normal file
|
@ -0,0 +1,322 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include "ggml-metal.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <ctime>
|
||||||
|
#include <string>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
#include <cmath>
#include <random>
|
||||||
|
|
||||||
|
#define DEBUG_POS 5
|
||||||
|
|
||||||
|
static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
|
||||||
|
printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
|
||||||
|
if (!with_data) return;
|
||||||
|
printf("%s: %s[0] = [", __func__, t->name);
|
||||||
|
for (size_t i = 0; i <= DEBUG_POS; i++) {
|
||||||
|
printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
|
||||||
|
}
|
||||||
|
printf(" ... ]\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace PCA {
|
||||||
|
|
||||||
|
// input params for PCA computations
|
||||||
|
struct pca_params {
|
||||||
|
int n_threads = 1;
|
||||||
|
int n_batch = 20; // number of iterations to do in one batch. the larger the batch, the more memory is used
|
||||||
|
int n_iterations = 1000;
|
||||||
|
float tolerance = 1e-7;
|
||||||
|
|
||||||
|
// for debugging
|
||||||
|
int i_layer = 0;
|
||||||
|
int n_layers = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
// result from each iteration
|
||||||
|
struct pca_result {
|
||||||
|
struct ggml_tensor * calculated_square = NULL;
|
||||||
|
std::vector<struct ggml_tensor *> eigenvectors;
|
||||||
|
std::vector<float> distances;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct pca_model {
|
||||||
|
ggml_backend_t backend = NULL;
|
||||||
|
ggml_backend_buffer_t buffer;
|
||||||
|
struct ggml_context * ctx; // context to compute graph on target device
|
||||||
|
struct ggml_context * ctx_host; // host context to store results
|
||||||
|
|
||||||
|
// tensors on target device
|
||||||
|
struct ggml_tensor * dev_input;
|
||||||
|
struct ggml_tensor * dev_square;
|
||||||
|
struct ggml_tensor * dev_eigenvector;
|
||||||
|
|
||||||
|
pca_model(struct ggml_tensor * t_input) {
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
fprintf(stderr, "%s: using CUDA backend\n", __func__);
|
||||||
|
backend = ggml_backend_cuda_init(0); // init device 0
|
||||||
|
if (!backend) {
|
||||||
|
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// TODO: enable Metal support when support for GGML_OP_SQRT is added
|
||||||
|
// #ifdef GGML_USE_METAL
|
||||||
|
// fprintf(stderr, "%s: using Metal backend\n", __func__);
|
||||||
|
// backend = ggml_backend_metal_init();
|
||||||
|
// if (!backend) {
|
||||||
|
// fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
|
||||||
|
// }
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
// if there is no GPU backend, fall back to the CPU backend
|
||||||
|
if (!backend) {
|
||||||
|
backend = ggml_backend_cpu_init();
|
||||||
|
}
|
||||||
|
|
||||||
|
const int num_tensors = 4;
|
||||||
|
struct ggml_init_params params {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ctx = ggml_init(params);
|
||||||
|
|
||||||
|
auto n_samples = t_input->ne[0];
|
||||||
|
auto n_embd = t_input->ne[1];
|
||||||
|
|
||||||
|
dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
|
||||||
|
dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||||
|
dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
|
ggml_set_name(dev_input, "dev_input");
|
||||||
|
ggml_set_name(dev_square, "dev_square");
|
||||||
|
ggml_set_name(dev_eigenvector, "dev_eigenvector");
|
||||||
|
buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
||||||
|
ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
|
||||||
|
|
||||||
|
// initialize eigenvector to random normalized vector
|
||||||
|
{
|
||||||
|
std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
|
||||||
|
std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
|
||||||
|
std::uniform_real_distribution<float> distribution(0.0, 1.0);
|
||||||
|
float sum_sqr = 0.0; // for normalizing random_vec
|
||||||
|
for (size_t i = 0; i < random_vec.size(); ++i) {
|
||||||
|
float f = distribution(generator);
|
||||||
|
sum_sqr += f * f;
|
||||||
|
random_vec[i] = f;
|
||||||
|
}
|
||||||
|
// normalize it
|
||||||
|
float random_vec_norm = std::sqrt(sum_sqr);
|
||||||
|
for (size_t i = 0; i < random_vec.size(); ++i) {
|
||||||
|
random_vec[i] /= random_vec_norm;
|
||||||
|
}
|
||||||
|
ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
~pca_model() {
|
||||||
|
ggml_free(ctx);
|
||||||
|
ggml_backend_buffer_free(buffer);
|
||||||
|
ggml_backend_free(backend);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct ggml_cgraph * build_graph_piter(
|
||||||
|
const struct pca_params & params,
|
||||||
|
const pca_model & model,
|
||||||
|
bool calc_square = false) {
|
||||||
|
GGML_ASSERT(params.n_batch > 0);
|
||||||
|
// TODO: buf_size must be able to scale with params.n_batch
|
||||||
|
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
|
||||||
|
static std::vector<uint8_t> buf(buf_size);
|
||||||
|
|
||||||
|
struct ggml_init_params params0 = {
|
||||||
|
/*.mem_size =*/ buf_size,
|
||||||
|
/*.mem_buffer =*/ buf.data(),
|
||||||
|
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
|
||||||
|
};
|
||||||
|
// create a temporary context to build the graph
|
||||||
|
struct ggml_context * ctx0 = ggml_init(params0);
|
||||||
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||||
|
|
||||||
|
// turn v_diff_original into square matrix if needed
|
||||||
|
struct ggml_tensor * tmp_square;
|
||||||
|
if (calc_square) {
|
||||||
|
tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
|
||||||
|
ggml_set_name(tmp_square, "tmp_square");
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * b_tensor;
|
||||||
|
struct ggml_tensor * distance;
|
||||||
|
struct ggml_tensor * old_eigen = model.dev_eigenvector;
|
||||||
|
struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
|
||||||
|
|
||||||
|
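// power iteration: each batch step below computes b = square * v and renormalizes it;
// v converges to the dominant eigenvector of `square` (the pairwise dot products of
// embedding dimensions across all samples), i.e. the leading PCA direction of the diffs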
for (int i = 0; i < params.n_batch; ++i) {
|
||||||
|
// b_tensor = square * eigenvector^T
|
||||||
|
b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
|
||||||
|
ggml_set_name(b_tensor, "b_tensor");
|
||||||
|
|
||||||
|
// normalize
|
||||||
|
b_tensor = ggml_div_inplace(ctx0,
|
||||||
|
b_tensor,
|
||||||
|
ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
|
||||||
|
);
|
||||||
|
ggml_format_name(b_tensor, "b_tensor_norm_%d", i);
|
||||||
|
|
||||||
|
// calculate distance(new eigenvector - old eigenvector)
|
||||||
|
// we don't use ggml_sub because it may not be implemented on GPU backend
|
||||||
|
struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
|
||||||
|
distance = ggml_sqrt_inplace(ctx0,
|
||||||
|
ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
|
||||||
|
ggml_format_name(distance, "distance_%d", i);
|
||||||
|
|
||||||
|
old_eigen = b_tensor;
|
||||||
|
|
||||||
|
// build operations nodes
|
||||||
|
ggml_build_forward_expand(gf, distance);
|
||||||
|
}
|
||||||
|
|
||||||
|
// delete the temporary context used to build the graph
|
||||||
|
ggml_free(ctx0);
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_status compute_piter(
|
||||||
|
const struct pca_params & params,
|
||||||
|
const pca_model & model,
|
||||||
|
struct ggml_cgraph * gf,
|
||||||
|
ggml_gallocr_t allocr,
|
||||||
|
struct pca_result & result) {
|
||||||
|
// allocate tensors
|
||||||
|
ggml_gallocr_alloc_graph(allocr, gf);
|
||||||
|
|
||||||
|
if (ggml_backend_is_cpu(model.backend)) {
|
||||||
|
ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: enable GPU support when support for GGML_OP_SQRT is added
|
||||||
|
//#ifdef GGML_USE_METAL
|
||||||
|
// if (ggml_backend_is_metal(model.backend)) {
|
||||||
|
// ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
|
||||||
|
// }
|
||||||
|
//#endif
|
||||||
|
|
||||||
|
ggml_status res = ggml_backend_graph_compute(model.backend, gf);
|
||||||
|
if (res == GGML_STATUS_SUCCESS) {
|
||||||
|
auto extract_i = [](std::string prefix, std::string str) -> int {
|
||||||
|
int i = -1;
|
||||||
|
if (str.rfind(prefix, 0) == 0) {
|
||||||
|
sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
};
|
||||||
|
result.calculated_square = NULL;
|
||||||
|
result.eigenvectors.clear();
|
||||||
|
result.distances.clear();
|
||||||
|
result.eigenvectors.resize(params.n_batch);
|
||||||
|
result.distances.resize(params.n_batch);
|
||||||
|
// get output nodes
|
||||||
|
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||||
|
auto node = gf->nodes[i];
|
||||||
|
int iter = -1;
|
||||||
|
// find b_tensor (without copying data from device)
|
||||||
|
if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
|
||||||
|
result.eigenvectors[iter] = node;
|
||||||
|
}
|
||||||
|
// find distances, then copy data from device
|
||||||
|
if ((iter = extract_i("distance_", node->name)) > -1) {
|
||||||
|
float d;
|
||||||
|
ggml_backend_tensor_get(node, &d, 0, sizeof(float));
|
||||||
|
result.distances[iter] = d;
|
||||||
|
// std::cout << node->name << " = " << d << "\n";
|
||||||
|
}
|
||||||
|
// find tmp_square if it exists (without copying data from device)
|
||||||
|
if (std::string(node->name) == "tmp_square") {
|
||||||
|
result.calculated_square = node;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void power_iteration(
|
||||||
|
const struct pca_params & params,
|
||||||
|
struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
|
||||||
|
struct ggml_tensor * output) {
|
||||||
|
//printf("in power iteration\n");
|
||||||
|
struct pca_model model(input);
|
||||||
|
|
||||||
|
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
|
||||||
|
struct pca_result result;
|
||||||
|
struct ggml_tensor * last_eigenvector = NULL;
|
||||||
|
|
||||||
|
int n_iters = params.n_iterations / params.n_batch; // larger batch size, fewer outer iterations
|
||||||
|
for (int iter = 0; iter < n_iters; ++iter) {
|
||||||
|
bool calc_square = (iter == 0); // only need to calculate square for first iteration
|
||||||
|
struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
|
||||||
|
// ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
|
||||||
|
compute_piter(params, model, gf, allocr, result);
|
||||||
|
|
||||||
|
for (size_t k = 0; k < result.distances.size(); ++k) {
|
||||||
|
last_eigenvector = result.eigenvectors[k];
|
||||||
|
if (result.distances[k] < params.tolerance) {
|
||||||
|
break; // done
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (calc_square) {
|
||||||
|
// copy and store the square matrix if needed
|
||||||
|
GGML_ASSERT(result.calculated_square != NULL);
|
||||||
|
ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// copy the last eigenvector and store it as input for the next iteration
|
||||||
|
GGML_ASSERT(last_eigenvector != NULL);
|
||||||
|
ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
|
||||||
|
__func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
|
||||||
|
}
|
||||||
|
|
||||||
|
// get output tensor
|
||||||
|
GGML_ASSERT(last_eigenvector);
|
||||||
|
ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
|
||||||
|
//print_debug_tensor(output);
|
||||||
|
ggml_gallocr_free(allocr);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void run_pca(
|
||||||
|
struct pca_params & params,
|
||||||
|
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
|
||||||
|
const std::vector<struct ggml_tensor *> & v_output) {
|
||||||
|
printf("%s: Running PCA...\n", __func__);
|
||||||
|
for (size_t il = 0; il < v_input.size(); ++il) {
|
||||||
|
|
||||||
|
// prepare output vector
|
||||||
|
struct ggml_tensor * ctrl_out = v_output[il];
|
||||||
|
ggml_format_name(ctrl_out, "direction.%ld", il+1);
|
||||||
|
|
||||||
|
// run power_iteration
|
||||||
|
params.i_layer = il;
|
||||||
|
params.n_layers = v_input.size();
|
||||||
|
power_iteration(params, v_input[il], ctrl_out);
|
||||||
|
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
1
examples/cvector-generator/positive.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
[INST] Act like a person who is extremely happy. [/INST]
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET embedding)
|
set(TARGET llama-embedding)
|
||||||
add_executable(${TARGET} embedding.cpp)
|
add_executable(${TARGET} embedding.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
|
||||||
### Unix-based systems (Linux, macOS, etc.):
|
### Unix-based systems (Linux, macOS, etc.):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
|
./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
|
||||||
```
|
```
|
||||||
|
|
||||||
### Windows:
|
### Windows:
|
||||||
|
|
||||||
```powershell
|
```powershell
|
||||||
embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
|
llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
|
||||||
```
|
```
|
||||||
|
|
||||||
The above command will output space-separated float values.
|
The above command will output space-separated float values.
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
set(TARGET eval-callback)
|
set(TARGET llama-eval-callback)
|
||||||
add_executable(${TARGET} eval-callback.cpp)
|
add_executable(${TARGET} eval-callback.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
|
||||||
set(TEST_TARGET test-eval-callback)
|
set(TEST_TARGET test-eval-callback)
|
||||||
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||||
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
||||||
|
|
|
@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data.
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
eval-callback \
|
llama-eval-callback \
|
||||||
--hf-repo ggml-org/models \
|
--hf-repo ggml-org/models \
|
||||||
--hf-file phi-2/ggml-model-q4_0.gguf \
|
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||||
--model phi-2-q4_0.gguf \
|
--model phi-2-q4_0.gguf \
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET export-lora)
|
set(TARGET llama-export-lora)
|
||||||
add_executable(${TARGET} export-lora.cpp)
|
add_executable(${TARGET} export-lora.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
Apply LORA adapters to base model and export the resulting model.
|
Apply LORA adapters to base model and export the resulting model.
|
||||||
|
|
||||||
```
|
```
|
||||||
usage: export-lora [options]
|
usage: llama-export-lora [options]
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
|
@ -17,7 +17,7 @@ options:
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./bin/export-lora \
|
./bin/llama-export-lora \
|
||||||
-m open-llama-3b-v2-q8_0.gguf \
|
-m open-llama-3b-v2-q8_0.gguf \
|
||||||
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
|
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
|
||||||
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
|
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET finetune)
|
set(TARGET llama-finetune)
|
||||||
add_executable(${TARGET} finetune.cpp)
|
add_executable(${TARGET} finetune.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -7,7 +7,7 @@ Basic usage instructions:
|
||||||
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
|
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
|
||||||
|
|
||||||
# finetune LORA adapter
|
# finetune LORA adapter
|
||||||
./bin/finetune \
|
./bin/llama-finetune \
|
||||||
--model-base open-llama-3b-v2-q8_0.gguf \
|
--model-base open-llama-3b-v2-q8_0.gguf \
|
||||||
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
|
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
|
||||||
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
|
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
|
||||||
|
@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
||||||
--use-checkpointing
|
--use-checkpointing
|
||||||
|
|
||||||
# predict
|
# predict
|
||||||
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||||
```
|
```
|
||||||
|
|
||||||
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
|
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
|
||||||
|
@ -38,14 +38,14 @@ After 10 more iterations:
|
||||||
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
|
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
|
||||||
|
|
||||||
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
|
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
|
||||||
These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.
|
These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
|
||||||
|
|
||||||
In `main` you can also load multiple LORA adapters, which will then be mixed together.
|
In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
|
||||||
|
|
||||||
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
|
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./bin/main -m open-llama-3b-v2-q8_0.gguf \
|
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||||
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
|
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
|
||||||
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
|
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
|
||||||
```
|
```
|
||||||
|
@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
|
||||||
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
|
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./bin/main -m open-llama-3b-v2-q8_0.gguf \
|
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||||
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
|
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
|
||||||
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
|
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
|
||||||
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
cd `dirname $0`
|
cd `dirname $0`
|
||||||
cd ../..
|
cd ../..
|
||||||
|
|
||||||
EXE="./finetune"
|
EXE="./llama-finetune"
|
||||||
|
|
||||||
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
|
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
|
||||||
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
|
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
set(TARGET gbnf-validator)
|
set(TARGET llama-gbnf-validator)
|
||||||
add_executable(${TARGET} gbnf-validator.cpp)
|
add_executable(${TARGET} gbnf-validator.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
|
|
@ -7,6 +7,8 @@
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <sstream>
|
||||||
|
#include <fstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
@@ -69,13 +71,14 @@ int main(int argc, char** argv) {
         return 1;
     }
 
-    fseek(grammar_file, 0, SEEK_END);
-    size_t grammar_size = ftell(grammar_file);
-    fseek(grammar_file, 0, SEEK_SET);
-
-    std::string grammar_str(grammar_size, ' ');
-    fread(&grammar_str[0], 1, grammar_size, grammar_file);
-    fclose(grammar_file);
+    std::string grammar_str;
+    {
+        std::ifstream grammar_file(grammar_filename);
+        GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
+        std::stringstream buffer;
+        buffer << grammar_file.rdbuf();
+        grammar_str = buffer.str();
+    }
 
     // Parse the GBNF grammar
     auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());

@@ -100,20 +103,15 @@ int main(int argc, char** argv) {
         grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
 
     // Read the input file
-    FILE* input_file = fopen(input_filename.c_str(), "r");
-    if (!input_file) {
-        fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str());
-        return 1;
-    }
-
-    fseek(input_file, 0, SEEK_END);
-    size_t input_size = ftell(input_file);
-    fseek(input_file, 0, SEEK_SET);
-
-    std::string input_str(input_size, ' ');
-    fread(&input_str[0], 1, input_size, input_file);
-    fclose(input_file);
+    std::string input_str;
+    {
+        std::ifstream input_file(input_filename);
+        GGML_ASSERT(input_file.is_open() && "Failed to open input file");
+        std::stringstream buffer;
+        buffer << input_file.rdbuf();
+        input_str = buffer.str();
+    }
 
     // Validate the input string against the grammar
     size_t error_pos;
     std::string error_msg;
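Both hunks above switch from the C `fopen`/`fseek`/`fread` idiom to slurping the whole file through a `std::ifstream`. For reference, that pattern can be wrapped in a small helper; the sketch below is ours, not part of the patch (the `read_file_to_string` name and the exception-based error handling are assumptions, the patch itself uses `GGML_ASSERT`):

```cpp
#include <cstdio>
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <string>

// Read an entire file into a std::string using the same
// std::ifstream + std::stringstream pattern as the hunks above.
static std::string read_file_to_string(const std::string & path) {
    std::ifstream file(path);
    if (!file.is_open()) {
        throw std::runtime_error("Failed to open file: " + path);
    }
    std::stringstream buffer;
    buffer << file.rdbuf(); // streams the whole file into the buffer
    return buffer.str();
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }
    const std::string text = read_file_to_string(argv[1]);
    std::printf("read %zu bytes\n", text.size());
    return 0;
}
```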
@@ -1,4 +1,4 @@
-set(TARGET gguf-split)
+set(TARGET llama-gguf-split)
 add_executable(${TARGET} gguf-split.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -18,8 +18,8 @@ fi
 
 set -x
 
-SPLIT=$1/gguf-split
-MAIN=$1/main
+SPLIT=$1/llama-gguf-split
+MAIN=$1/llama-cli
 WORK_PATH=$TMP_DIR/gguf-split
 ROOT_DIR=$(realpath $(dirname $0)/../../)
@@ -1,4 +1,4 @@
-set(TARGET gguf)
+set(TARGET llama-gguf)
 add_executable(${TARGET} gguf.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
@@ -1,4 +1,4 @@
-set(TARGET gritlm)
+set(TARGET llama-gritlm)
 add_executable(${TARGET} gritlm.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -26,7 +26,7 @@ $ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --ou
 
 Run the example using the downloaded model:
 ```console
-$ ./gritlm -m models/gritlm-7b_q4_1.gguf
+$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
 
 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
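The two numbers in this sample output are plain cosine similarities between the document and query embeddings. As a reminder of what that metric computes, here is a minimal standalone sketch (not code from gritlm.cpp; the vectors are made-up toy values):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Cosine similarity of two equally sized embedding vectors:
// dot(a, b) / (|a| * |b|), which lies in [-1, 1] for non-zero inputs.
static float cosine_similarity(const std::vector<float> & a, const std::vector<float> & b) {
    float dot = 0.0f, na = 0.0f, nb = 0.0f;
    for (size_t i = 0; i < a.size() && i < b.size(); ++i) {
        dot += a[i] * b[i];
        na  += a[i] * a[i];
        nb  += b[i] * b[i];
    }
    if (na == 0.0f || nb == 0.0f) {
        return 0.0f; // degenerate (all-zero) embedding
    }
    return dot / (std::sqrt(na) * std::sqrt(nb));
}

int main() {
    const std::vector<float> doc   = {0.12f, -0.40f, 0.33f};
    const std::vector<float> query = {0.10f, -0.38f, 0.35f};
    std::printf("cosine similarity: %.3f\n", cosine_similarity(doc, query));
    return 0;
}
```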
@@ -1,4 +1,4 @@
-set(TARGET imatrix)
+set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -6,7 +6,7 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
 ## Usage
 
 ```
-./imatrix \
+./llama-imatrix \
 -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
 [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
 [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]

@@ -28,8 +28,8 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 LLAMA_CUDA=1 make -j
 
 # generate importance matrix (imatrix.dat)
-./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
+./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
 
 # use the imatrix to perform a Q4_K_M quantization
-./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
+./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
 ```
@@ -1,4 +1,4 @@
-set(TARGET infill)
+set(TARGET llama-infill)
 add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -42,5 +42,5 @@ scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.ggu
 ```
 
 ```bash
-./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
+./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
 ```
@@ -223,7 +223,11 @@ int main(int argc, char ** argv) {
         inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
         embd_inp = inp_pfx;
         embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-        embd_inp.push_back(llama_token_middle(model));
+
+        const llama_token middle_token = llama_token_middle(model);
+        if (middle_token >= 0) {
+            embd_inp.push_back(middle_token);
+        }
 
         LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
         LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));

@@ -528,7 +532,12 @@ int main(int argc, char ** argv) {
             inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
             embd_inp = inp_pfx;
             embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-            embd_inp.push_back(llama_token_middle(model));
+
+            const llama_token middle_token = llama_token_middle(model);
+            if (middle_token >= 0) {
+                embd_inp.push_back(middle_token);
+            }
+
             embd.clear();
             n_remain = params.n_predict;
             n_past = 0;
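Both hunks replace an unconditional `push_back` with a guard: when the vocabulary defines no fill-in-the-middle (FIM) middle token, `llama_token_middle` yields a negative id and nothing should be appended to the prompt. A minimal standalone illustration of that guard, using a hypothetical `get_middle_token` stand-in rather than the real llama.cpp API:

```cpp
#include <cstdio>
#include <vector>

using token_id = int;

// Stand-in for llama_token_middle(): returns -1 when the vocabulary
// has no FIM middle token. This helper is an assumption for the sketch,
// not part of llama.cpp.
static token_id get_middle_token(bool has_fim_middle) {
    return has_fim_middle ? 32009 : -1;
}

int main() {
    std::vector<token_id> embd_inp = {1, 2, 3}; // dummy prefix + suffix tokens

    // Same guard as the patch: only append the middle token if it exists.
    const token_id middle_token = get_middle_token(/*has_fim_middle=*/false);
    if (middle_token >= 0) {
        embd_inp.push_back(middle_token);
    }

    std::printf("prompt length: %zu tokens\n", embd_inp.size());
    return 0;
}
```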
@@ -21,7 +21,7 @@ counter=1
 echo 'Running'
 while IFS= read -r question
 do
-exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
+exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
 echo $counter
 echo "Current Question: $question"
 eval "$exe_cmd"
@@ -1,5 +1,5 @@
 # Usage:
-#! ./server -m some-model.gguf &
+#! ./llama-server -m some-model.gguf &
 #! pip install pydantic
 #! python json-schema-pydantic-example.py
@@ -523,7 +523,7 @@ class SchemaConverter:
 def main(args_in = None):
     parser = argparse.ArgumentParser(
         description='''
-            Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
+            Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
             given JSON schema. Only a subset of JSON schema features are supported; more may be
             added in the future.
         ''',
@@ -1,4 +1,4 @@
-# llama.cpp/example/llama-bench
+# llama.cpp/examples/llama-bench
 
 Performance testing tool for llama.cpp.
@@ -293,6 +293,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.output_format = cmd_params_defaults.output_format;
     params.output_format_stderr = cmd_params_defaults.output_format_stderr;
     params.reps = cmd_params_defaults.reps;
+    params.numa = cmd_params_defaults.numa;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];

@@ -713,7 +714,6 @@ struct test {
     static const bool kompute;
     static const bool metal;
     static const bool sycl;
-    static const bool rpc;
     static const bool gpu_blas;
     static const bool blas;
     static const std::string cpu_info;

@@ -725,6 +725,7 @@ struct test {
     int n_batch;
     int n_ubatch;
     int n_threads;
+    bool has_rpc;
     ggml_type type_k;
     ggml_type type_v;
     int n_gpu_layers;

@@ -750,6 +751,7 @@ struct test {
         n_batch = inst.n_batch;
         n_ubatch = inst.n_ubatch;
         n_threads = inst.n_threads;
+        has_rpc = !inst.rpc_servers.empty();
         type_k = inst.type_k;
         type_v = inst.type_v;
         n_gpu_layers = inst.n_gpu_layers;

@@ -809,9 +811,6 @@ struct test {
         if (sycl) {
             return GGML_SYCL_NAME;
         }
-        if (rpc) {
-            return "RPC";
-        }
         if (gpu_blas) {
             return "GPU BLAS";
         }

@@ -881,7 +880,7 @@ struct test {
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
             std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_ubatch),

@@ -915,7 +914,6 @@ const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();
const bool test::sycl = !!ggml_cpu_has_sycl();
-const bool test::rpc = !!ggml_cpu_has_rpc();
const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info();

@@ -1181,6 +1179,9 @@ struct markdown_printer : public printer {
             value = buf;
         } else if (field == "backend") {
             value = test::get_backend();
+            if (t.has_rpc) {
+                value += "+RPC";
+            }
         } else if (field == "test") {
             if (t.n_prompt > 0 && t.n_gen == 0) {
                 snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
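Taken together, these hunks stop treating RPC as a compile-time capability column and instead derive a per-test flag from whether any RPC servers were requested, surfacing it as a `+RPC` suffix on the backend name. A tiny standalone sketch of that labelling logic (the `backend_label` helper is ours, not the llama-bench internals):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Build the backend label the way the patched markdown printer does:
// start from the backend name and append "+RPC" when RPC servers are in use.
static std::string backend_label(const std::string & backend,
                                 const std::vector<std::string> & rpc_servers) {
    std::string value = backend;
    if (!rpc_servers.empty()) { // mirrors has_rpc = !inst.rpc_servers.empty()
        value += "+RPC";
    }
    return value;
}

int main() {
    std::printf("%s\n", backend_label("CUDA", {}).c_str());                    // prints "CUDA"
    std::printf("%s\n", backend_label("CUDA", {"192.168.1.2:50052"}).c_str()); // prints "CUDA+RPC"
    return 0;
}
```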
@@ -30,8 +30,9 @@ if(TARGET BUILD_INFO)
     add_dependencies(llava BUILD_INFO)
 endif()
 
-set(TARGET llava-cli)
-add_executable(llava-cli llava-cli.cpp)
-install(TARGETS llava-cli RUNTIME)
-target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(llava PRIVATE cxx_std_11)
+set(TARGET llama-llava-cli)
+add_executable(${TARGET} llava-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
 Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
 
 ## Usage
-Build with cmake or run `make llava-cli` to build it.
+Build with cmake or run `make llama-llava-cli` to build it.
 
-After building, run: `./llava-cli` to see the usage. For example:
+After building, run: `./llama-llava-cli` to see the usage. For example:
 
 ```sh
-./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
 --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
 --image path/to/an/image.jpg \
 -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"

@@ -62,7 +62,7 @@ python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
 
 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
 ```sh
-./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
 ```
 
 Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.

@@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
 ### case 1
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
 -m /data/local/tmp/ggml-model-q4_k.gguf \
 --mmproj /data/local/tmp/mmproj-model-f16.gguf \
 -t 4 \

@@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms
 ### case 2
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
 -m /data/local/tmp/ggml-model-q4_k.gguf \
 --mmproj /data/local/tmp/mmproj-model-f16.gguf \
 -t 4 \

@@ -126,7 +126,7 @@ llama_print_timings: total time = 34570.79 ms
 #### llava-cli release-b2005
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
 -m /data/local/tmp/ggml-model-q4_k.gguf \
 --mmproj /data/local/tmp/mmproj-model-f16.gguf \
 -t 4 \

@@ -200,7 +200,7 @@ make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ### case 1
 **input**
 ```sh
-./llava-cli \
+./llama-llava-cli \
 -m /data/local/tmp/ggml-model-q4_k.gguf \
 --mmproj /data/local/tmp/mmproj-model-f16.gguf \
 --image /data/local/tmp/demo.jpeg \

@@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens
 ### case 2
 **input**
 ```sh
-./llava-cli \
+./llama-llava-cli \
 -m /data/local/tmp/ggml-model-q4_k.gguf \
 --mmproj /data/local/tmp/mmproj-model-f16.gguf \
 -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
@@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
 After API is confirmed, more models will be supported / uploaded.
 
 ## Usage
-Build with cmake or run `make llava-cli` to build it.
+Build with cmake or run `make llama-llava-cli` to build it.
 
-After building, run: `./llava-cli` to see the usage. For example:
+After building, run: `./llama-llava-cli` to see the usage. For example:
 
 ```sh
-./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```
 
 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.

@@ -95,9 +95,9 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto
 python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```
 
-7) And finally we can run the llava-cli using the 1.6 model version:
+7) And finally we can run the llava cli using the 1.6 model version:
 ```console
-./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
+./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```
 
 **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
Some files were not shown because too many files have changed in this diff.