Merge remote-tracking branch 'origin/master' into json-bounds2
This commit is contained in:
commit
f03e9b935b
175 changed files with 3867 additions and 2668 deletions
|
@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
|
|||
stage('Running llama.cpp'){
|
||||
sh'''#!/bin/bash
|
||||
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
|
||||
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
|
||||
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
|
||||
cat llama_log.txt # Printing results
|
||||
'''
|
||||
}
|
||||
|
|
|
@ -23,13 +23,13 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
|||
# Enable CUDA
|
||||
ENV LLAMA_CUDA=1
|
||||
|
||||
RUN make -j$(nproc) main
|
||||
RUN make -j$(nproc) llama-cli
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libgomp1
|
||||
|
||||
COPY --from=build /app/main /main
|
||||
COPY --from=build /app/llama-cli /llama-cli
|
||||
|
||||
ENTRYPOINT [ "/main" ]
|
||||
ENTRYPOINT [ "/llama-cli" ]
|
26
.devops/llama-cli-intel.Dockerfile
Normal file
26
.devops/llama-cli-intel.Dockerfile
Normal file
|
@ -0,0 +1,26 @@
|
|||
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||
|
||||
ARG LLAMA_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
||||
echo "LLAMA_SYCL_F16 is set" && \
|
||||
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release --target llama-cli
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||
|
||||
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/llama-cli" ]
|
|
@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
|
|||
ENV CC=/opt/rocm/llvm/bin/clang
|
||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||
|
||||
RUN make -j$(nproc) main
|
||||
RUN make -j$(nproc) llama-cli
|
||||
|
||||
ENTRYPOINT [ "/app/main" ]
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
|
@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
|
|||
WORKDIR /app
|
||||
COPY . .
|
||||
RUN cmake -B build -DLLAMA_VULKAN=1 && \
|
||||
cmake --build build --config Release --target main
|
||||
cmake --build build --config Release --target llama-cli
|
||||
|
||||
# Clean up
|
||||
WORKDIR /
|
||||
RUN cp /app/build/bin/main /main && \
|
||||
RUN cp /app/build/bin/llama-cli /llama-cli && \
|
||||
rm -rf /app
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/main" ]
|
||||
ENTRYPOINT [ "/llama-cli" ]
|
|
@ -9,15 +9,15 @@ WORKDIR /app
|
|||
|
||||
COPY . .
|
||||
|
||||
RUN make -j$(nproc) main
|
||||
RUN make -j$(nproc) llama-cli
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libgomp1
|
||||
|
||||
COPY --from=build /app/main /main
|
||||
COPY --from=build /app/llama-cli /llama-cli
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/main" ]
|
||||
ENTRYPOINT [ "/llama-cli" ]
|
|
@ -36,9 +36,9 @@ make -j LLAMA_CLBLAST=1
|
|||
|
||||
%install
|
||||
mkdir -p %{buildroot}%{_bindir}/
|
||||
cp -p main %{buildroot}%{_bindir}/llamaclblast
|
||||
cp -p server %{buildroot}%{_bindir}/llamaclblastserver
|
||||
cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
|
||||
cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli
|
||||
cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server
|
||||
cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple
|
||||
|
||||
mkdir -p %{buildroot}/usr/lib/systemd/system
|
||||
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
|
||||
|
@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
|
|||
[Service]
|
||||
Type=simple
|
||||
EnvironmentFile=/etc/sysconfig/llama
|
||||
ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
|
||||
ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS
|
||||
ExecReload=/bin/kill -s HUP $MAINPID
|
||||
Restart=never
|
||||
|
||||
|
@ -67,9 +67,9 @@ rm -rf %{buildroot}
|
|||
rm -rf %{_builddir}/*
|
||||
|
||||
%files
|
||||
%{_bindir}/llamaclblast
|
||||
%{_bindir}/llamaclblastserver
|
||||
%{_bindir}/llamaclblastsimple
|
||||
%{_bindir}/llama-clblast-cli
|
||||
%{_bindir}/llama-clblast-server
|
||||
%{_bindir}/llama-clblast-simple
|
||||
/usr/lib/systemd/system/llamaclblast.service
|
||||
%config /etc/sysconfig/llama
|
||||
|
||||
|
|
|
@ -36,9 +36,9 @@ make -j LLAMA_CUDA=1
|
|||
|
||||
%install
|
||||
mkdir -p %{buildroot}%{_bindir}/
|
||||
cp -p main %{buildroot}%{_bindir}/llamacppcuda
|
||||
cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
|
||||
cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
|
||||
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
|
||||
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
|
||||
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
|
||||
|
||||
mkdir -p %{buildroot}/usr/lib/systemd/system
|
||||
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
|
||||
|
@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
|
|||
[Service]
|
||||
Type=simple
|
||||
EnvironmentFile=/etc/sysconfig/llama
|
||||
ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
|
||||
ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
|
||||
ExecReload=/bin/kill -s HUP $MAINPID
|
||||
Restart=never
|
||||
|
||||
|
@ -67,9 +67,9 @@ rm -rf %{buildroot}
|
|||
rm -rf %{_builddir}/*
|
||||
|
||||
%files
|
||||
%{_bindir}/llamacppcuda
|
||||
%{_bindir}/llamacppcudaserver
|
||||
%{_bindir}/llamacppcudasimple
|
||||
%{_bindir}/llama-cuda-cli
|
||||
%{_bindir}/llama-cuda-server
|
||||
%{_bindir}/llama-cuda-simple
|
||||
/usr/lib/systemd/system/llamacuda.service
|
||||
%config /etc/sysconfig/llama
|
||||
|
||||
|
|
|
@ -38,9 +38,9 @@ make -j
|
|||
|
||||
%install
|
||||
mkdir -p %{buildroot}%{_bindir}/
|
||||
cp -p main %{buildroot}%{_bindir}/llama
|
||||
cp -p server %{buildroot}%{_bindir}/llamaserver
|
||||
cp -p simple %{buildroot}%{_bindir}/llamasimple
|
||||
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
|
||||
cp -p llama-server %{buildroot}%{_bindir}/llama-server
|
||||
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
|
||||
|
||||
mkdir -p %{buildroot}/usr/lib/systemd/system
|
||||
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
|
||||
|
@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
|
|||
[Service]
|
||||
Type=simple
|
||||
EnvironmentFile=/etc/sysconfig/llama
|
||||
ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
|
||||
ExecStart=/usr/bin/llama-server $LLAMA_ARGS
|
||||
ExecReload=/bin/kill -s HUP $MAINPID
|
||||
Restart=never
|
||||
|
||||
|
@ -69,9 +69,9 @@ rm -rf %{buildroot}
|
|||
rm -rf %{_builddir}/*
|
||||
|
||||
%files
|
||||
%{_bindir}/llama
|
||||
%{_bindir}/llamaserver
|
||||
%{_bindir}/llamasimple
|
||||
%{_bindir}/llama-cli
|
||||
%{_bindir}/llama-server
|
||||
%{_bindir}/llama-simple
|
||||
/usr/lib/systemd/system/llama.service
|
||||
%config /etc/sysconfig/llama
|
||||
|
||||
|
|
|
@ -25,13 +25,13 @@ ENV LLAMA_CUDA=1
|
|||
# Enable cURL
|
||||
ENV LLAMA_CURL=1
|
||||
|
||||
RUN make -j$(nproc) server
|
||||
RUN make -j$(nproc) llama-server
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev libgomp1
|
||||
|
||||
COPY --from=build /app/server /server
|
||||
COPY --from=build /app/llama-server /llama-server
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
||||
ENTRYPOINT [ "/llama-server" ]
|
29
.devops/llama-server-intel.Dockerfile
Normal file
29
.devops/llama-server-intel.Dockerfile
Normal file
|
@ -0,0 +1,29 @@
|
|||
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||
|
||||
ARG LLAMA_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git libcurl4-openssl-dev
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
||||
echo "LLAMA_SYCL_F16 is set" && \
|
||||
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release --target llama-server
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev
|
||||
|
||||
COPY --from=build /app/build/bin/llama-server /llama-server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/llama-server" ]
|
|
@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
|
|||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev
|
||||
|
||||
RUN make -j$(nproc)
|
||||
RUN make -j$(nproc) llama-server
|
||||
|
||||
ENTRYPOINT [ "/app/server" ]
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
|
@ -19,13 +19,13 @@ RUN apt-get update && \
|
|||
WORKDIR /app
|
||||
COPY . .
|
||||
RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
|
||||
cmake --build build --config Release --target server
|
||||
cmake --build build --config Release --target llama-server
|
||||
|
||||
# Clean up
|
||||
WORKDIR /
|
||||
RUN cp /app/build/bin/server /server && \
|
||||
RUN cp /app/build/bin/llama-server /llama-server && \
|
||||
rm -rf /app
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
||||
ENTRYPOINT [ "/llama-server" ]
|
|
@ -11,15 +11,15 @@ COPY . .
|
|||
|
||||
ENV LLAMA_CURL=1
|
||||
|
||||
RUN make -j$(nproc) server
|
||||
RUN make -j$(nproc) llama-server
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev libgomp1
|
||||
|
||||
COPY --from=build /app/server /server
|
||||
COPY --from=build /app/llama-server /llama-server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
||||
ENTRYPOINT [ "/llama-server" ]
|
|
@ -1,34 +0,0 @@
|
|||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
||||
rm /etc/apt/sources.list.d/intel-graphics.list && \
|
||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
||||
|
||||
ARG LLAMA_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
||||
echo "LLAMA_SYCL_F16 is set" && \
|
||||
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release --target main
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||
|
||||
COPY --from=build /app/build/bin/main /main
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/main" ]
|
|
@ -6,11 +6,11 @@
|
|||
let
|
||||
inherit (config.packages) default;
|
||||
binaries = [
|
||||
"llama"
|
||||
"llama-cli"
|
||||
"llama-embedding"
|
||||
"llama-server"
|
||||
"quantize"
|
||||
"train-text-from-scratch"
|
||||
"llama-quantize"
|
||||
"llama-train-text-from-scratch"
|
||||
];
|
||||
mkApp = name: {
|
||||
type = "app";
|
||||
|
|
|
@ -243,8 +243,6 @@ effectiveStdenv.mkDerivation (
|
|||
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
||||
# if they haven't been added yet.
|
||||
postInstall = ''
|
||||
mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
|
||||
mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
|
||||
mkdir -p $out/include
|
||||
cp $src/llama.h $out/include/
|
||||
'';
|
||||
|
@ -294,7 +292,7 @@ effectiveStdenv.mkDerivation (
|
|||
license = lib.licenses.mit;
|
||||
|
||||
# Accommodates `nix run` and `lib.getExe`
|
||||
mainProgram = "llama";
|
||||
mainProgram = "llama-cli";
|
||||
|
||||
# These people might respond, on the best effort basis, if you ping them
|
||||
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
|
||||
|
|
|
@ -1,45 +0,0 @@
|
|||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
||||
rm /etc/apt/sources.list.d/intel-graphics.list && \
|
||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
||||
|
||||
ARG LLAMA_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git libcurl4-openssl-dev
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
||||
echo "LLAMA_SYCL_F16 is set" && \
|
||||
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release --target server
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
||||
rm /etc/apt/sources.list.d/intel-graphics.list && \
|
||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev
|
||||
|
||||
COPY --from=build /app/build/bin/server /server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
|
@ -10,11 +10,11 @@ shift
|
|||
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
|
||||
python3 ./convert-hf-to-gguf.py "$@"
|
||||
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
|
||||
./quantize "$@"
|
||||
./llama-quantize "$@"
|
||||
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
||||
./main "$@"
|
||||
./llama-cli "$@"
|
||||
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
|
||||
./finetune "$@"
|
||||
./llama-finetune "$@"
|
||||
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
||||
echo "Converting PTH to GGML..."
|
||||
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
||||
|
@ -22,11 +22,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
|||
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
|
||||
else
|
||||
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
|
||||
./quantize "$i" "${i/f16/q4_0}" q4_0
|
||||
./llama-quantize "$i" "${i/f16/q4_0}" q4_0
|
||||
fi
|
||||
done
|
||||
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
|
||||
./server "$@"
|
||||
./llama-server "$@"
|
||||
else
|
||||
echo "Unknown command: $arg1"
|
||||
echo "Available commands: "
|
||||
|
|
|
@ -12,8 +12,8 @@ build*/
|
|||
|
||||
models/*
|
||||
|
||||
/main
|
||||
/quantize
|
||||
/llama-cli
|
||||
/llama-quantize
|
||||
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
|
|
2
.github/ISSUE_TEMPLATE/01-bug-low.yml
vendored
2
.github/ISSUE_TEMPLATE/01-bug-low.yml
vendored
|
@ -24,7 +24,7 @@ body:
|
|||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
$./llama-cli --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
|
|
2
.github/ISSUE_TEMPLATE/02-bug-medium.yml
vendored
2
.github/ISSUE_TEMPLATE/02-bug-medium.yml
vendored
|
@ -24,7 +24,7 @@ body:
|
|||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
$./llama-cli --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
|
|
2
.github/ISSUE_TEMPLATE/03-bug-high.yml
vendored
2
.github/ISSUE_TEMPLATE/03-bug-high.yml
vendored
|
@ -24,7 +24,7 @@ body:
|
|||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
$./llama-cli --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
|
|
2
.github/ISSUE_TEMPLATE/04-bug-critical.yml
vendored
2
.github/ISSUE_TEMPLATE/04-bug-critical.yml
vendored
|
@ -24,7 +24,7 @@ body:
|
|||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
$./llama-cli --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
|
|
5
.github/pull_request_template.md
vendored
Normal file
5
.github/pull_request_template.md
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
- Self Reported Review Complexity:
|
||||
- [ ] Review Complexity : Low
|
||||
- [ ] Review Complexity : Medium
|
||||
- [ ] Review Complexity : High
|
||||
- [ ] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
|
2
.github/workflows/bench.yml
vendored
2
.github/workflows/bench.yml
vendored
|
@ -119,7 +119,7 @@ jobs:
|
|||
-DLLAMA_FATAL_WARNINGS=OFF \
|
||||
-DLLAMA_ALL_WARNINGS=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release;
|
||||
cmake --build build --config Release -j $(nproc) --target server
|
||||
cmake --build build --config Release -j $(nproc) --target llama-server
|
||||
|
||||
- name: Download the dataset
|
||||
id: download_dataset
|
||||
|
|
19
.github/workflows/build.yml
vendored
19
.github/workflows/build.yml
vendored
|
@ -13,7 +13,7 @@ on:
|
|||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
||||
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
|
@ -103,12 +103,10 @@ jobs:
|
|||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
mkdir build
|
||||
cd build
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
|
@ -241,8 +239,8 @@ jobs:
|
|||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
|
@ -684,7 +682,7 @@ jobs:
|
|||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||
|
||||
windows-latest-cmake:
|
||||
runs-on: windows-latest
|
||||
runs-on: windows-2019
|
||||
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
|
@ -829,7 +827,7 @@ jobs:
|
|||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
windows-latest-cmake-cuda:
|
||||
runs-on: windows-latest
|
||||
runs-on: windows-2019
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
|
@ -843,8 +841,9 @@ jobs:
|
|||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- uses: Jimver/cuda-toolkit@v0.2.11
|
||||
- name: Install CUDA toolkit
|
||||
id: cuda-toolkit
|
||||
uses: Jimver/cuda-toolkit@v0.2.15
|
||||
with:
|
||||
cuda: ${{ matrix.cuda }}
|
||||
method: 'network'
|
||||
|
|
16
.github/workflows/docker.yml
vendored
16
.github/workflows/docker.yml
vendored
|
@ -30,20 +30,20 @@ jobs:
|
|||
strategy:
|
||||
matrix:
|
||||
config:
|
||||
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
|
||||
# have disabled them for now until the reason why
|
||||
# is understood.
|
||||
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
|
10
.github/workflows/server.yml
vendored
10
.github/workflows/server.yml
vendored
|
@ -16,11 +16,9 @@ on:
|
|||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||
pull_request_target:
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||
schedule:
|
||||
- cron: '2 4 * * *'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
|
@ -98,7 +96,7 @@ jobs:
|
|||
-DLLAMA_CURL=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
|
@ -115,7 +113,7 @@ jobs:
|
|||
|
||||
|
||||
server-windows:
|
||||
runs-on: windows-latest
|
||||
runs-on: windows-2019
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
@ -138,7 +136,7 @@ jobs:
|
|||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
|
|
43
.gitignore
vendored
43
.gitignore
vendored
|
@ -46,48 +46,9 @@ models/*
|
|||
models-mnt
|
||||
|
||||
/Pipfile
|
||||
/baby-llama
|
||||
/beam-search
|
||||
/benchmark-matmult
|
||||
/convert-llama2c-to-ggml
|
||||
/embd-input-test
|
||||
/embedding
|
||||
/eval-callback
|
||||
/gguf
|
||||
/gguf-llama-simple
|
||||
/gguf-split
|
||||
/gritlm
|
||||
/imatrix
|
||||
/infill
|
||||
/libllama.so
|
||||
/llama-bench
|
||||
/llava-cli
|
||||
/lookahead
|
||||
/lookup
|
||||
/lookup-create
|
||||
/lookup-merge
|
||||
/lookup-stats
|
||||
/main
|
||||
/metal
|
||||
/passkey
|
||||
/perplexity
|
||||
/q8dot
|
||||
/quantize
|
||||
/quantize-stats
|
||||
/result
|
||||
/save-load-state
|
||||
/server
|
||||
/simple
|
||||
/batched
|
||||
/batched-bench
|
||||
/export-lora
|
||||
/finetune
|
||||
/retrieval
|
||||
/speculative
|
||||
/parallel
|
||||
/train-text-from-scratch
|
||||
/tokenize
|
||||
/vdot
|
||||
/llama-*
|
||||
llama-batched-swift
|
||||
/common/build-info.cpp
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
|
|
|
@ -402,12 +402,26 @@ if (LLAMA_CUBLAS)
|
|||
endif()
|
||||
|
||||
if (LLAMA_CUDA)
|
||||
cmake_minimum_required(VERSION 3.17)
|
||||
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
|
||||
|
||||
find_package(CUDAToolkit)
|
||||
if (CUDAToolkit_FOUND)
|
||||
message(STATUS "CUDA found")
|
||||
|
||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
# 52 == lowest CUDA 12 standard
|
||||
# 60 == f16 CUDA intrinsics
|
||||
# 61 == integer CUDA intrinsics
|
||||
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
|
||||
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
|
||||
enable_language(CUDA)
|
||||
|
||||
set(GGML_HEADERS_CUDA ggml-cuda.h)
|
||||
|
@ -472,21 +486,6 @@ if (LLAMA_CUDA)
|
|||
else()
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
# 52 == lowest CUDA 12 standard
|
||||
# 60 == f16 CUDA intrinsics
|
||||
# 61 == integer CUDA intrinsics
|
||||
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
|
||||
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||
#set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
|
||||
else()
|
||||
message(WARNING "CUDA not found")
|
||||
endif()
|
||||
|
|
14
CONTRIBUTING.md
Normal file
14
CONTRIBUTING.md
Normal file
|
@ -0,0 +1,14 @@
|
|||
# Contributing Guidelines
|
||||
|
||||
## Checklist
|
||||
|
||||
* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
|
||||
* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
|
||||
* Execute [the full CI locally on your machine](ci/README.md) before publishing
|
||||
|
||||
## PR formatting
|
||||
|
||||
* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
|
||||
- The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your conveience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
|
||||
* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
|
||||
* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`
|
138
Makefile
138
Makefile
|
@ -1,8 +1,44 @@
|
|||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = \
|
||||
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
|
||||
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
||||
libllava.a \
|
||||
llama-baby-llama \
|
||||
llama-batched \
|
||||
llama-batched-bench \
|
||||
llama-bench \
|
||||
llama-benchmark-matmult \
|
||||
llama-cli \
|
||||
llama-convert-llama2c-to-ggml \
|
||||
llama-embedding \
|
||||
llama-eval-callback \
|
||||
llama-export-lora \
|
||||
llama-finetune \
|
||||
llama-gbnf-validator \
|
||||
llama-gguf \
|
||||
llama-gguf-split \
|
||||
llama-gritlm \
|
||||
llama-imatrix \
|
||||
llama-infill \
|
||||
llama-llava-cli \
|
||||
llama-lookahead \
|
||||
llama-lookup \
|
||||
llama-lookup-create \
|
||||
llama-lookup-merge \
|
||||
llama-lookup-stats \
|
||||
llama-parallel \
|
||||
llama-passkey \
|
||||
llama-perplexity \
|
||||
llama-q8dot \
|
||||
llama-quantize \
|
||||
llama-quantize-stats \
|
||||
llama-retrieval \
|
||||
llama-save-load-state \
|
||||
llama-server \
|
||||
llama-simple \
|
||||
llama-speculative \
|
||||
llama-tokenize \
|
||||
llama-train-text-from-scratch \
|
||||
llama-vdot \
|
||||
tests/test-c.o
|
||||
|
||||
# Binaries only useful for tests
|
||||
TEST_TARGETS = \
|
||||
|
@ -777,7 +813,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
|
|||
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
|
||||
|
||||
clean:
|
||||
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
rm -vrf *.o tests/*.o *.so *.a *.dll common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
rm -vrf ggml-cuda/*.o
|
||||
rm -vrf ggml-cuda/template-instances/*.o
|
||||
find examples pocs -type f -name "*.o" -delete
|
||||
|
@ -793,62 +829,62 @@ clean:
|
|||
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
|
||||
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
|
||||
|
||||
main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
llama-cli: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@echo
|
||||
@echo '==== Run ./main -h for help. ===='
|
||||
@echo '==== Run ./llama-cli -h for help. ===='
|
||||
@echo
|
||||
|
||||
infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
llama-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
|
||||
llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
llama-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
||||
|
||||
|
@ -861,23 +897,23 @@ examples/server/%.hpp: examples/server/public/% Makefile
|
|||
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
|
||||
) > $@
|
||||
|
||||
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
||||
llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
|
||||
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
|
@ -888,55 +924,61 @@ llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS)
|
|||
libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
|
||||
|
||||
llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
|
||||
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
|
||||
|
||||
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
llama-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
|
||||
llama-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
llama-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
|
||||
|
||||
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
llama-lookup-create: examples/lookup/lookup-create.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-lookup-merge: examples/lookup/lookup-merge.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-lookup-stats: examples/lookup/lookup-stats.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
|
@ -962,20 +1004,20 @@ build-info.o: common/build-info.cpp
|
|||
|
||||
tests: $(TEST_TARGETS)
|
||||
|
||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
|
||||
llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
run-benchmark-matmult: benchmark-matmult
|
||||
run-benchmark-matmult: llama-benchmark-matmult
|
||||
./$@
|
||||
|
||||
.PHONY: run-benchmark-matmult swift
|
||||
|
||||
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||
llama-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
|
||||
llama-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
|
|
|
@ -77,7 +77,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
|
|||
*Notes:*
|
||||
|
||||
- **Memory**
|
||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
|
||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
|
||||
|
||||
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
|
||||
|
||||
|
@ -99,14 +99,14 @@ The docker build option is currently limited to *intel GPU* targets.
|
|||
### Build image
|
||||
```sh
|
||||
# Using FP16
|
||||
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
|
||||
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
|
||||
```
|
||||
|
||||
*Notes*:
|
||||
|
||||
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
|
||||
|
||||
You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
|
||||
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
||||
|
||||
### Run container
|
||||
|
||||
|
@ -275,7 +275,7 @@ source /opt/intel/oneapi/setvars.sh
|
|||
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
|
||||
|
||||
```sh
|
||||
./build/bin/ls-sycl-device
|
||||
./build/bin/llama-ls-sycl-device
|
||||
```
|
||||
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
|
||||
```
|
||||
|
@ -313,7 +313,7 @@ Examples:
|
|||
- Use device 0:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
```
|
||||
or run by script:
|
||||
|
||||
|
@ -324,7 +324,7 @@ or run by script:
|
|||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
```
|
||||
|
||||
Otherwise, you can run the script:
|
||||
|
@ -427,7 +427,7 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
|
|||
|
||||
*Notes:*
|
||||
|
||||
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
|
||||
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`.
|
||||
|
||||
### III. Run the inference
|
||||
|
||||
|
@ -488,13 +488,13 @@ Examples:
|
|||
- Use device 0:
|
||||
|
||||
```
|
||||
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```
|
||||
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||
```
|
||||
Otherwise, run the following wrapper script:
|
||||
|
||||
|
|
66
README.md
66
README.md
|
@ -10,6 +10,9 @@
|
|||
|
||||
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
||||
|
||||
> [!IMPORTANT]
|
||||
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
|
||||
|
||||
### Recent API changes
|
||||
|
||||
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
|
||||
|
@ -53,7 +56,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
|||
<li><a href="#quantization">Quantization</a></li>
|
||||
<li><a href="#interactive-mode">Interactive mode</a></li>
|
||||
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
|
||||
<li><a href="#instruct-mode">Instruct mode</a></li>
|
||||
<li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
|
||||
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
|
||||
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
|
||||
|
@ -218,7 +220,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
|||
Here is a typical run using LLaMA v2 13B on M2 Ultra:
|
||||
|
||||
```
|
||||
$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
|
||||
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
|
||||
I llama.cpp build info:
|
||||
I UNAME_S: Darwin
|
||||
I UNAME_P: arm
|
||||
|
@ -556,7 +558,7 @@ Building the program with BLAS support may lead to some performance improvements
|
|||
|
||||
```sh
|
||||
# Build the image
|
||||
docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
|
||||
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
|
||||
|
||||
# Then, use it:
|
||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
|
@ -577,7 +579,9 @@ Building the program with BLAS support may lead to some performance improvements
|
|||
vulkaninfo
|
||||
```
|
||||
|
||||
Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
|
||||
Alternatively your package manager might be able to provide the appropriate libraries.
|
||||
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
|
||||
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
|
||||
|
||||
Then, build llama.cpp using the cmake command below:
|
||||
|
||||
|
@ -585,7 +589,7 @@ Building the program with BLAS support may lead to some performance improvements
|
|||
cmake -B build -DLLAMA_VULKAN=1
|
||||
cmake --build build --config Release
|
||||
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
||||
./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||
|
||||
# You should see in the output, ggml_vulkan detected your GPU. For example:
|
||||
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
|
||||
|
@ -622,17 +626,17 @@ python3 convert-hf-to-gguf.py models/mymodel/
|
|||
python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
|
||||
|
||||
# quantize the model to 4-bits (using Q4_K_M method)
|
||||
./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
|
||||
# update the gguf filetype to current version if older version is now unsupported
|
||||
./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
|
||||
./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
|
||||
```
|
||||
|
||||
### Run the quantized model
|
||||
|
||||
```bash
|
||||
# start inference on a gguf model
|
||||
./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
|
||||
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
|
||||
```
|
||||
|
||||
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
||||
|
@ -707,7 +711,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread
|
|||
#### How to run
|
||||
|
||||
1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
|
||||
2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
|
||||
3. Output:
|
||||
```
|
||||
perplexity : calculating perplexity over 655 chunks
|
||||
|
@ -731,16 +735,16 @@ Here is an example of a few-shot interaction, invoked with the command
|
|||
./examples/chat-13B.sh
|
||||
|
||||
# custom arguments using a 13B model
|
||||
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
|
||||
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
|
||||
```
|
||||
|
||||
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
|
||||
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
|
||||
|
||||

|
||||
|
||||
### Persistent Interaction
|
||||
|
||||
The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
|
||||
The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
|
||||
|
||||
```bash
|
||||
# Start a new chat
|
||||
|
@ -762,41 +766,13 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
|
|||
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
|
||||
|
||||
```bash
|
||||
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
|
||||
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
|
||||
```
|
||||
|
||||
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
|
||||
|
||||
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
|
||||
|
||||
### Instruct mode
|
||||
|
||||
1. First, download and place the `ggml` model into the `./models` folder
|
||||
2. Run the `main` tool like this:
|
||||
|
||||
```
|
||||
./examples/alpaca.sh
|
||||
```
|
||||
|
||||
Sample run:
|
||||
|
||||
```
|
||||
== Running in interactive mode. ==
|
||||
- Press Ctrl+C to interject at any time.
|
||||
- Press Return to return control to LLaMA.
|
||||
- If you want to submit another line, end your input in '\'.
|
||||
|
||||
Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
||||
|
||||
> How many letters are there in the English alphabet?
|
||||
There 26 letters in the English Alphabet
|
||||
> What is the most common way of transportation in Amsterdam?
|
||||
The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
|
||||
> List 5 words that start with "ca".
|
||||
cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
|
||||
>
|
||||
```
|
||||
|
||||
### Obtaining and using the Facebook LLaMA 2 model
|
||||
|
||||
- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
|
||||
|
@ -869,7 +845,7 @@ $mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/ho
|
|||
Now, you can start chatting:
|
||||
```
|
||||
$cd /data/data/com.termux/files/home/bin
|
||||
$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
|
||||
$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
|
||||
```
|
||||
|
||||
Here's a demo of an interactive session running on Pixel 5 phone:
|
||||
|
@ -936,8 +912,8 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
|
|||
|
||||
```bash
|
||||
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
|
||||
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
|
||||
docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
|
||||
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
|
||||
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
|
||||
```
|
||||
|
||||
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
|
||||
|
@ -987,7 +963,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
|
|||
|
||||
### Docs
|
||||
|
||||
- [main](./examples/main/README.md)
|
||||
- [main (cli)](./examples/main/README.md)
|
||||
- [server](./examples/server/README.md)
|
||||
- [jeopardy](./examples/jeopardy/README.md)
|
||||
- [BLIS](./docs/BLIS.md)
|
||||
|
|
224
ci/run.sh
224
ci/run.sh
|
@ -303,47 +303,47 @@ function gg_run_open_llama_7b_v2 {
|
|||
|
||||
wiki_test="${path_wiki}/wiki.test.raw"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -437,45 +437,45 @@ function gg_run_pythia_1_4b {
|
|||
|
||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -569,47 +569,47 @@ function gg_run_pythia_2_8b {
|
|||
|
||||
wiki_test="${path_wiki}/wiki.test.raw"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -693,10 +693,10 @@ function gg_run_embd_bge_small {
|
|||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
|
||||
(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
|
|
@ -200,19 +200,13 @@ void gpt_params_handle_model_default(gpt_params & params) {
|
|||
}
|
||||
params.hf_file = params.model;
|
||||
} else if (params.model.empty()) {
|
||||
std::string cache_directory = fs_get_cache_directory();
|
||||
const bool success = fs_create_directory_with_parents(cache_directory);
|
||||
if (!success) {
|
||||
throw std::runtime_error("failed to create cache directory: " + cache_directory);
|
||||
}
|
||||
params.model = cache_directory + string_split(params.hf_file, '/').back();
|
||||
params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
|
||||
}
|
||||
} else if (!params.model_url.empty()) {
|
||||
if (params.model.empty()) {
|
||||
auto f = string_split(params.model_url, '#').front();
|
||||
f = string_split(f, '?').front();
|
||||
f = string_split(f, '/').back();
|
||||
params.model = "models/" + f;
|
||||
params.model = fs_get_cache_file(string_split(f, '/').back());
|
||||
}
|
||||
} else if (params.model.empty()) {
|
||||
params.model = DEFAULT_MODEL_PATH;
|
||||
|
@ -2279,6 +2273,16 @@ std::string fs_get_cache_directory() {
|
|||
return ensure_trailing_slash(cache_directory);
|
||||
}
|
||||
|
||||
std::string fs_get_cache_file(const std::string & filename) {
|
||||
GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
|
||||
std::string cache_directory = fs_get_cache_directory();
|
||||
const bool success = fs_create_directory_with_parents(cache_directory);
|
||||
if (!success) {
|
||||
throw std::runtime_error("failed to create cache directory: " + cache_directory);
|
||||
}
|
||||
return cache_directory + filename;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Model utils
|
||||
|
|
|
@ -277,6 +277,7 @@ bool fs_validate_filename(const std::string & filename);
|
|||
bool fs_create_directory_with_parents(const std::string & path);
|
||||
|
||||
std::string fs_get_cache_directory();
|
||||
std::string fs_get_cache_file(const std::string & filename);
|
||||
|
||||
//
|
||||
// Model utils
|
||||
|
|
|
@ -263,7 +263,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
|
|||
throw std::runtime_error("At least one of min_value or max_value must be set");
|
||||
}
|
||||
|
||||
const std::string SPACE_RULE = "\" \"?";
|
||||
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
|
||||
|
||||
struct BuiltinRule {
|
||||
std::string content;
|
||||
|
@ -280,7 +280,7 @@ std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
|
|||
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
|
||||
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
|
||||
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
|
||||
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
|
||||
{"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
|
||||
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
|
||||
{"null", {"\"null\" space", {}}},
|
||||
};
|
||||
|
|
|
@ -47,11 +47,12 @@ class Model:
|
|||
_model_classes: dict[str, type[Model]] = {}
|
||||
|
||||
dir_model: Path
|
||||
ftype: int
|
||||
ftype: gguf.LlamaFileType
|
||||
is_big_endian: bool
|
||||
endianess: gguf.GGUFEndian
|
||||
use_temp_file: bool
|
||||
lazy: bool
|
||||
model_name: str | None
|
||||
part_names: list[str]
|
||||
is_safetensors: bool
|
||||
hparams: dict[str, Any]
|
||||
|
@ -64,7 +65,7 @@ class Model:
|
|||
# subclasses should define this!
|
||||
model_arch: gguf.MODEL_ARCH
|
||||
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
|
||||
if type(self) is Model:
|
||||
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
||||
self.dir_model = dir_model
|
||||
|
@ -73,10 +74,11 @@ class Model:
|
|||
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
|
||||
self.use_temp_file = use_temp_file
|
||||
self.lazy = not eager
|
||||
self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
|
||||
self.model_name = model_name
|
||||
self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
|
||||
self.is_safetensors = len(self.part_names) > 0
|
||||
if not self.is_safetensors:
|
||||
self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
|
||||
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
|
||||
self.hparams = Model.load_hparams(self.dir_model)
|
||||
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
@ -94,7 +96,7 @@ class Model:
|
|||
ftype_lw: str = ftype_up.lower()
|
||||
# allow templating the file name with the output ftype, useful with the "auto" ftype
|
||||
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
|
||||
self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
|
||||
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
|
||||
|
||||
@classmethod
|
||||
def __init_subclass__(cls):
|
||||
|
@ -182,7 +184,7 @@ class Model:
|
|||
return new_name
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
|
||||
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
|
||||
|
@ -324,21 +326,21 @@ class Model:
|
|||
|
||||
def write(self):
|
||||
self.write_tensors()
|
||||
self.gguf_writer.write_header_to_file()
|
||||
self.gguf_writer.write_header_to_file(self.fname_out)
|
||||
self.gguf_writer.write_kv_data_to_file()
|
||||
self.gguf_writer.write_tensors_to_file(progress=True)
|
||||
self.gguf_writer.close()
|
||||
|
||||
def write_vocab(self):
|
||||
self.gguf_writer.write_header_to_file()
|
||||
self.gguf_writer.write_header_to_file(self.fname_out)
|
||||
self.gguf_writer.write_kv_data_to_file()
|
||||
self.gguf_writer.close()
|
||||
|
||||
@staticmethod
|
||||
def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
|
||||
def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
|
||||
part_names: list[str] = []
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.endswith(suffix):
|
||||
if filename.startswith(prefix) and filename.endswith(suffix):
|
||||
part_names.append(filename)
|
||||
|
||||
part_names.sort()
|
||||
|
@ -665,7 +667,7 @@ class GPTNeoXModel(Model):
|
|||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
|
@ -798,7 +800,7 @@ class MPTModel(Model):
|
|||
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["n_layers"]
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
|
@ -850,7 +852,7 @@ class OrionModel(Model):
|
|||
raise ValueError("gguf: can not find ctx length parameter.")
|
||||
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_source_hf_repo(hf_repo)
|
||||
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
self.gguf_writer.add_context_length(ctx_length)
|
||||
|
@ -887,7 +889,7 @@ class BaichuanModel(Model):
|
|||
else:
|
||||
raise ValueError("gguf: can not find ctx length parameter.")
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_source_hf_repo(hf_repo)
|
||||
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
self.gguf_writer.add_context_length(ctx_length)
|
||||
|
@ -1010,7 +1012,7 @@ class XverseModel(Model):
|
|||
else:
|
||||
raise ValueError("gguf: can not find ctx length parameter.")
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_source_hf_repo(hf_repo)
|
||||
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
self.gguf_writer.add_context_length(ctx_length)
|
||||
|
@ -1206,7 +1208,7 @@ class StableLMModel(Model):
|
|||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
|
@ -1681,7 +1683,7 @@ class GPT2Model(Model):
|
|||
model_arch = gguf.MODEL_ARCH.GPT2
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_block_count(self.hparams["n_layer"])
|
||||
self.gguf_writer.add_context_length(self.hparams["n_ctx"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
|
@ -2248,7 +2250,7 @@ class GemmaModel(Model):
|
|||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
|
@ -2348,7 +2350,7 @@ class MambaModel(Model):
|
|||
# Fail early for models which don't have a block expansion factor of 2
|
||||
assert d_inner == 2 * d_model
|
||||
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
|
||||
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
|
||||
self.gguf_writer.add_embedding_length(d_model)
|
||||
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
|
||||
|
@ -2852,7 +2854,7 @@ def main() -> None:
|
|||
logger.error(f"Model {hparams['architectures'][0]} is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
|
||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
|
||||
|
||||
logger.info("Set model parameters")
|
||||
model_instance.set_gguf_parameters()
|
||||
|
|
|
@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
|
|||
|
||||
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
|
||||
|
||||
Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
|
||||
Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
|
||||
|
||||
## GGUF specification
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
## Verifying that the model is running on the GPU with CUDA
|
||||
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
||||
```shell
|
||||
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
||||
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
||||
```
|
||||
|
||||
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
|
||||
|
@ -27,7 +27,7 @@ RAM: 32GB
|
|||
|
||||
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
|
||||
|
||||
Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
||||
Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
||||
|
||||
Result:
|
||||
|
||||
|
|
|
@ -13,42 +13,43 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
|||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(baby-llama)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(benchmark)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(eval-callback)
|
||||
add_subdirectory(export-lora)
|
||||
add_subdirectory(finetune)
|
||||
add_subdirectory(gritlm)
|
||||
add_subdirectory(gbnf-validator)
|
||||
add_subdirectory(gguf-split)
|
||||
add_subdirectory(gguf)
|
||||
add_subdirectory(gritlm)
|
||||
add_subdirectory(imatrix)
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(llava)
|
||||
if (LLAMA_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
endif()
|
||||
add_subdirectory(main)
|
||||
add_subdirectory(tokenize)
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(perplexity)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(retrieval)
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(passkey)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(lookahead)
|
||||
add_subdirectory(lookup)
|
||||
add_subdirectory(gguf)
|
||||
add_subdirectory(train-text-from-scratch)
|
||||
add_subdirectory(imatrix)
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
add_subdirectory(export-lora)
|
||||
add_subdirectory(main)
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(passkey)
|
||||
add_subdirectory(perplexity)
|
||||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(retrieval)
|
||||
if (LLAMA_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
if (LLAMA_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
endif()
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(tokenize)
|
||||
add_subdirectory(train-text-from-scratch)
|
||||
endif()
|
||||
|
|
|
@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
|
|||
GEN_OPTIONS+=(--threads "$N_THREAD")
|
||||
fi
|
||||
|
||||
./main "${GEN_OPTIONS[@]}" \
|
||||
./llama-cli "${GEN_OPTIONS[@]}" \
|
||||
--model "$MODEL" \
|
||||
--in-prefix " " \
|
||||
--in-suffix "${AI_NAME}:" \
|
||||
|
|
|
@ -1,19 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
|
||||
--color \
|
||||
-f ./prompts/alpaca.txt \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
-ins -b 256 \
|
||||
--top_k 10000 \
|
||||
--temp 0.2 \
|
||||
--repeat_penalty 1.1 \
|
||||
-t 7
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET baby-llama)
|
||||
set(TARGET llama-baby-llama)
|
||||
add_executable(${TARGET} baby-llama.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -58,4 +58,4 @@ echo "$2
|
|||
model=$1
|
||||
|
||||
# generate the most likely continuation until the string "===" is found
|
||||
./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
|
||||
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET batched-bench)
|
||||
set(TARGET llama-batched-bench)
|
||||
add_executable(${TARGET} batched-bench.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -10,16 +10,16 @@ There are 2 modes of operation:
|
|||
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
||||
|
||||
```bash
|
||||
./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
|
||||
./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
|
||||
|
||||
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
||||
./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
|
||||
./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
|
||||
|
||||
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
||||
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
|
||||
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
|
||||
|
||||
# custom set of batches
|
||||
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
|
||||
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
|
||||
```
|
||||
|
||||
## Sample results
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
.PHONY: build
|
||||
|
||||
build:
|
||||
xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
|
||||
rm -f ./batched_swift
|
||||
ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
|
||||
xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
|
||||
rm -f ./llama-batched-swift
|
||||
ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
import PackageDescription
|
||||
|
||||
let package = Package(
|
||||
name: "batched_swift",
|
||||
name: "llama-batched-swift",
|
||||
platforms: [.macOS(.v12)],
|
||||
dependencies: [
|
||||
.package(name: "llama", path: "../../"),
|
||||
|
@ -13,7 +13,7 @@ let package = Package(
|
|||
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
||||
// Targets can depend on other targets in this package and products from dependencies.
|
||||
.executableTarget(
|
||||
name: "batched_swift",
|
||||
name: "llama-batched-swift",
|
||||
dependencies: ["llama"],
|
||||
path: "Sources",
|
||||
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
This is a swift clone of `examples/batched`.
|
||||
|
||||
$ `make`
|
||||
$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
|
||||
$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET batched)
|
||||
set(TARGET llama-batched)
|
||||
add_executable(${TARGET} batched.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
The example demonstrates batched generation from a given prompt
|
||||
|
||||
```bash
|
||||
./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
|
||||
./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
|
||||
|
||||
...
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET benchmark)
|
||||
set(TARGET llama-bench-matmult)
|
||||
add_executable(${TARGET} benchmark-matmult.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
|
|||
$PROMPT_TEMPLATE > $PROMPT_FILE
|
||||
|
||||
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
||||
./main $GEN_OPTIONS \
|
||||
./llama-cli $GEN_OPTIONS \
|
||||
--model "$MODEL" \
|
||||
--threads "$N_THREAD" \
|
||||
--n_predict "$N_PREDICTS" \
|
||||
|
|
|
@ -62,7 +62,7 @@ fi
|
|||
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
|
||||
echo 'Prompt cache does not exist, building...'
|
||||
# Default batch_size to 64 here for better user feedback during initial prompt processing
|
||||
./main 2>>"$LOG" \
|
||||
./llama-cli 2>>"$LOG" \
|
||||
--batch_size 64 \
|
||||
"${OPTS[@]}" \
|
||||
--prompt-cache "$PROMPT_CACHE_FILE" \
|
||||
|
@ -109,13 +109,13 @@ while read -e line; do
|
|||
|
||||
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
|
||||
|
||||
./main 2>>"$LOG" "${OPTS[@]}" \
|
||||
./llama-cli 2>>"$LOG" "${OPTS[@]}" \
|
||||
--prompt-cache "$CUR_PROMPT_CACHE" \
|
||||
--prompt-cache-all \
|
||||
--file "$CUR_PROMPT_FILE" \
|
||||
--reverse-prompt "${USER_NAME}:" \
|
||||
--n_predict "$n_predict" |
|
||||
skip_bytes 1 | # skip BOS token added by ./main
|
||||
skip_bytes 1 | # skip BOS token added by ./llama-cli
|
||||
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
|
||||
skip_bytes "$n_prompt_len_pre" # print generation
|
||||
|
||||
|
@ -133,7 +133,7 @@ while read -e line; do
|
|||
# TODO get both messages in one go
|
||||
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
|
||||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
||||
echo >&2 "Couldn't get number of tokens from ./main output!"
|
||||
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -144,7 +144,7 @@ while read -e line; do
|
|||
fi
|
||||
|
||||
# Update cache for next prompt in background, ideally during user input
|
||||
./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
|
||||
./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
|
||||
--prompt-cache "$NEXT_PROMPT_CACHE" \
|
||||
--file "$NEXT_PROMPT_FILE" \
|
||||
--n_predict 1 &
|
||||
|
|
|
@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
|
|||
$PROMPT_TEMPLATE > $PROMPT_FILE
|
||||
|
||||
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
||||
./bin/main $GEN_OPTIONS \
|
||||
./bin/llama-cli $GEN_OPTIONS \
|
||||
--model "$MODEL" \
|
||||
--threads "$N_THREAD" \
|
||||
--n_predict "$N_PREDICTS" \
|
||||
|
|
|
@ -11,6 +11,6 @@ cd ..
|
|||
#
|
||||
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
|
||||
#
|
||||
./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
|
||||
./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
|
||||
--repeat_penalty 1.0 --color -i \
|
||||
-r "User:" -f prompts/chat-with-bob.txt
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET convert-llama2c-to-ggml)
|
||||
set(TARGET llama-convert-llama2c-to-ggml)
|
||||
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -8,7 +8,7 @@ To convert the model first download the models from the [llama2.c](https://githu
|
|||
|
||||
After successful compilation, following usage options are available:
|
||||
```
|
||||
usage: ./convert-llama2c-to-ggml [options]
|
||||
usage: ./llama-convert-llama2c-to-ggml [options]
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
|
@ -19,10 +19,10 @@ options:
|
|||
|
||||
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
|
||||
|
||||
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
|
||||
`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
|
||||
|
||||
Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
|
||||
|
||||
Now you can use the model with a command like:
|
||||
|
||||
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
|
||||
`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET embedding)
|
||||
set(TARGET llama-embedding)
|
||||
add_executable(${TARGET} embedding.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
|
|||
### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
|
||||
./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
|
||||
```
|
||||
|
||||
### Windows:
|
||||
|
||||
```powershell
|
||||
embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
|
||||
llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
|
||||
```
|
||||
|
||||
The above command will output space-separated float values.
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
set(TARGET eval-callback)
|
||||
set(TARGET llama-eval-callback)
|
||||
add_executable(${TARGET} eval-callback.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
set(TEST_TARGET test-eval-callback)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
||||
|
|
|
@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data.
|
|||
Usage:
|
||||
|
||||
```shell
|
||||
eval-callback \
|
||||
llama-eval-callback \
|
||||
--hf-repo ggml-org/models \
|
||||
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||
--model phi-2-q4_0.gguf \
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET export-lora)
|
||||
set(TARGET llama-export-lora)
|
||||
add_executable(${TARGET} export-lora.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
Apply LORA adapters to base model and export the resulting model.
|
||||
|
||||
```
|
||||
usage: export-lora [options]
|
||||
usage: llama-export-lora [options]
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
|
@ -17,7 +17,7 @@ options:
|
|||
For example:
|
||||
|
||||
```bash
|
||||
./bin/export-lora \
|
||||
./bin/llama-export-lora \
|
||||
-m open-llama-3b-v2-q8_0.gguf \
|
||||
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
|
||||
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET finetune)
|
||||
set(TARGET llama-finetune)
|
||||
add_executable(${TARGET} finetune.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -7,7 +7,7 @@ Basic usage instructions:
|
|||
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
|
||||
|
||||
# finetune LORA adapter
|
||||
./bin/finetune \
|
||||
./bin/llama-finetune \
|
||||
--model-base open-llama-3b-v2-q8_0.gguf \
|
||||
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
|
||||
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
|
||||
|
@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
|||
--use-checkpointing
|
||||
|
||||
# predict
|
||||
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||
```
|
||||
|
||||
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
|
||||
|
@ -38,14 +38,14 @@ After 10 more iterations:
|
|||
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
|
||||
|
||||
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
|
||||
These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.
|
||||
These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
|
||||
|
||||
In `main` you can also load multiple LORA adapters, which will then be mixed together.
|
||||
In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
|
||||
|
||||
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
|
||||
|
||||
```bash
|
||||
./bin/main -m open-llama-3b-v2-q8_0.gguf \
|
||||
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
|
||||
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
|
||||
```
|
||||
|
@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
|
|||
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
|
||||
|
||||
```bash
|
||||
./bin/main -m open-llama-3b-v2-q8_0.gguf \
|
||||
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
|
||||
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
|
||||
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
cd `dirname $0`
|
||||
cd ../..
|
||||
|
||||
EXE="./finetune"
|
||||
EXE="./llama-finetune"
|
||||
|
||||
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
|
||||
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
set(TARGET gbnf-validator)
|
||||
set(TARGET llama-gbnf-validator)
|
||||
add_executable(${TARGET} gbnf-validator.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
|
@ -7,6 +7,8 @@
|
|||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
|
@ -69,13 +71,14 @@ int main(int argc, char** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
fseek(grammar_file, 0, SEEK_END);
|
||||
size_t grammar_size = ftell(grammar_file);
|
||||
fseek(grammar_file, 0, SEEK_SET);
|
||||
|
||||
std::string grammar_str(grammar_size, ' ');
|
||||
fread(&grammar_str[0], 1, grammar_size, grammar_file);
|
||||
fclose(grammar_file);
|
||||
std::string grammar_str;
|
||||
{
|
||||
std::ifstream grammar_file(grammar_filename);
|
||||
GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
|
||||
std::stringstream buffer;
|
||||
buffer << grammar_file.rdbuf();
|
||||
grammar_str = buffer.str();
|
||||
}
|
||||
|
||||
// Parse the GBNF grammar
|
||||
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
|
||||
|
@ -100,20 +103,15 @@ int main(int argc, char** argv) {
|
|||
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
|
||||
// Read the input file
|
||||
FILE* input_file = fopen(input_filename.c_str(), "r");
|
||||
if (!input_file) {
|
||||
fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str());
|
||||
return 1;
|
||||
std::string input_str;
|
||||
{
|
||||
std::ifstream input_file(input_filename);
|
||||
GGML_ASSERT(input_file.is_open() && "Failed to open input file");
|
||||
std::stringstream buffer;
|
||||
buffer << input_file.rdbuf();
|
||||
input_str = buffer.str();
|
||||
}
|
||||
|
||||
fseek(input_file, 0, SEEK_END);
|
||||
size_t input_size = ftell(input_file);
|
||||
fseek(input_file, 0, SEEK_SET);
|
||||
|
||||
std::string input_str(input_size, ' ');
|
||||
fread(&input_str[0], 1, input_size, input_file);
|
||||
fclose(input_file);
|
||||
|
||||
// Validate the input string against the grammar
|
||||
size_t error_pos;
|
||||
std::string error_msg;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET gguf-split)
|
||||
set(TARGET llama-gguf-split)
|
||||
add_executable(${TARGET} gguf-split.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -18,8 +18,8 @@ fi
|
|||
|
||||
set -x
|
||||
|
||||
SPLIT=$1/gguf-split
|
||||
MAIN=$1/main
|
||||
SPLIT=$1/llama-gguf-split
|
||||
MAIN=$1/llama-cli
|
||||
WORK_PATH=$TMP_DIR/gguf-split
|
||||
ROOT_DIR=$(realpath $(dirname $0)/../../)
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET gguf)
|
||||
set(TARGET llama-gguf)
|
||||
add_executable(${TARGET} gguf.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -1,15 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main --color --instruct --threads 4 \
|
||||
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
|
||||
--file ./prompts/alpaca.txt \
|
||||
--batch_size 8 --ctx_size 2048 -n -1 \
|
||||
--repeat_last_n 64 --repeat_penalty 1.3 \
|
||||
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET gritlm)
|
||||
set(TARGET llama-gritlm)
|
||||
add_executable(${TARGET} gritlm.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -26,7 +26,7 @@ $ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --ou
|
|||
|
||||
Run the example using the downloaded model:
|
||||
```console
|
||||
$ ./gritlm -m models/gritlm-7b_q4_1.gguf
|
||||
$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
|
||||
|
||||
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
|
||||
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET imatrix)
|
||||
set(TARGET llama-imatrix)
|
||||
add_executable(${TARGET} imatrix.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -6,7 +6,7 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
|
|||
## Usage
|
||||
|
||||
```
|
||||
./imatrix \
|
||||
./llama-imatrix \
|
||||
-m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
|
||||
[--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
|
||||
[--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
|
||||
|
@ -28,8 +28,8 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
|
|||
LLAMA_CUDA=1 make -j
|
||||
|
||||
# generate importance matrix (imatrix.dat)
|
||||
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
||||
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
||||
|
||||
# use the imatrix to perform a Q4_K_M quantization
|
||||
./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
|
||||
./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
|
||||
```
|
||||
|
|
|
@ -218,20 +218,64 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
|||
fname += std::to_string(ncall);
|
||||
}
|
||||
|
||||
// avoid writing imatrix entries that do not have full data
|
||||
// this can happen with MoE models where some of the experts end up not being exercised by the provided training data
|
||||
|
||||
int n_entries = 0;
|
||||
std::vector<std::string> to_store;
|
||||
|
||||
bool is_first = true; // for printing
|
||||
for (const auto & kv : m_stats) {
|
||||
const int n_all = kv.second.counts.size();
|
||||
|
||||
if (n_all == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int n_zeros = 0;
|
||||
for (const int c : kv.second.counts) {
|
||||
if (c == 0) {
|
||||
n_zeros++;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_zeros != 0 && is_first) {
|
||||
fprintf(stderr, "\n");
|
||||
is_first = false;
|
||||
}
|
||||
|
||||
if (n_zeros == n_all) {
|
||||
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (n_zeros > 0) {
|
||||
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
|
||||
continue;
|
||||
}
|
||||
|
||||
n_entries++;
|
||||
to_store.push_back(kv.first);
|
||||
}
|
||||
|
||||
if (to_store.size() < m_stats.size()) {
|
||||
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
|
||||
}
|
||||
|
||||
std::ofstream out(fname, std::ios::binary);
|
||||
int n_entries = m_stats.size();
|
||||
out.write((const char *) &n_entries, sizeof(n_entries));
|
||||
for (const auto & p : m_stats) {
|
||||
int len = p.first.size();
|
||||
for (const auto & name : to_store) {
|
||||
const auto & stat = m_stats.at(name);
|
||||
int len = name.size();
|
||||
out.write((const char *) &len, sizeof(len));
|
||||
out.write(p.first.c_str(), len);
|
||||
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
|
||||
int nval = p.second.values.size();
|
||||
out.write(name.c_str(), len);
|
||||
out.write((const char *) &stat.ncall, sizeof(stat.ncall));
|
||||
int nval = stat.values.size();
|
||||
out.write((const char *) &nval, sizeof(nval));
|
||||
if (nval > 0) {
|
||||
std::vector<float> tmp(nval);
|
||||
for (int i = 0; i < nval; i++) {
|
||||
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
|
||||
tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
|
||||
}
|
||||
out.write((const char*)tmp.data(), nval*sizeof(float));
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET infill)
|
||||
set(TARGET llama-infill)
|
||||
add_executable(${TARGET} infill.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -42,5 +42,5 @@ scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.ggu
|
|||
```
|
||||
|
||||
```bash
|
||||
./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
|
||||
./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
|
||||
```
|
||||
|
|
|
@ -21,7 +21,7 @@ counter=1
|
|||
echo 'Running'
|
||||
while IFS= read -r question
|
||||
do
|
||||
exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
|
||||
exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
|
||||
echo $counter
|
||||
echo "Current Question: $question"
|
||||
eval "$exe_cmd"
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Usage:
|
||||
#! ./server -m some-model.gguf &
|
||||
#! ./llama-server -m some-model.gguf &
|
||||
#! pip install pydantic
|
||||
#! python json-schema-pydantic-example.py
|
||||
|
||||
|
|
|
@ -193,9 +193,8 @@ class BuiltinRule:
|
|||
self.content = content
|
||||
self.deps = deps or []
|
||||
|
||||
# whitespace is constrained to a single space char to prevent model "running away" in
|
||||
# whitespace. Also maybe improves generation quality?
|
||||
SPACE_RULE = '" "?'
|
||||
# Constraining spaces to prevent model "running away".
|
||||
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
|
||||
|
||||
PRIMITIVE_RULES = {
|
||||
'boolean' : BuiltinRule('("true" | "false") space', []),
|
||||
|
@ -207,7 +206,7 @@ PRIMITIVE_RULES = {
|
|||
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
|
||||
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
|
||||
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
|
||||
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
|
||||
'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
|
||||
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
|
||||
'null' : BuiltinRule('"null" space', []),
|
||||
}
|
||||
|
@ -706,7 +705,7 @@ class SchemaConverter:
|
|||
def main(args_in = None):
|
||||
parser = argparse.ArgumentParser(
|
||||
description='''
|
||||
Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
|
||||
Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
|
||||
given JSON schema. Only a subset of JSON schema features are supported; more may be
|
||||
added in the future.
|
||||
''',
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# llama.cpp/example/llama-bench
|
||||
# llama.cpp/examples/llama-bench
|
||||
|
||||
Performance testing tool for llama.cpp.
|
||||
|
||||
|
|
|
@ -1033,6 +1033,27 @@ struct markdown_printer : public printer {
|
|||
if (field == "n_gpu_layers") {
|
||||
return 3;
|
||||
}
|
||||
if (field == "n_threads") {
|
||||
return 7;
|
||||
}
|
||||
if (field == "n_batch") {
|
||||
return 7;
|
||||
}
|
||||
if (field == "n_ubatch") {
|
||||
return 8;
|
||||
}
|
||||
if (field == "type_k" || field == "type_v") {
|
||||
return 6;
|
||||
}
|
||||
if (field == "split_mode") {
|
||||
return 5;
|
||||
}
|
||||
if (field == "flash_attn") {
|
||||
return 2;
|
||||
}
|
||||
if (field == "use_mmap") {
|
||||
return 4;
|
||||
}
|
||||
if (field == "test") {
|
||||
return 13;
|
||||
}
|
||||
|
|
|
@ -1,18 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
|
||||
--color \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
-ins -b 256 \
|
||||
--top_k 10000 \
|
||||
--temp 0.2 \
|
||||
--repeat_penalty 1.1 \
|
||||
-t 8
|
|
@ -1,18 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
|
||||
--color \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
-ins -b 256 \
|
||||
--top_k 10000 \
|
||||
--temp 0.2 \
|
||||
--repeat_penalty 1.1 \
|
||||
-t 8
|
|
@ -30,8 +30,9 @@ if(TARGET BUILD_INFO)
|
|||
add_dependencies(llava BUILD_INFO)
|
||||
endif()
|
||||
|
||||
set(TARGET llava-cli)
|
||||
add_executable(llava-cli llava-cli.cpp)
|
||||
install(TARGETS llava-cli RUNTIME)
|
||||
target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(llava PRIVATE cxx_std_11)
|
||||
set(TARGET llama-llava-cli)
|
||||
add_executable(${TARGET} llava-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
|
@ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
|
|||
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
|
||||
|
||||
## Usage
|
||||
Build with cmake or run `make llava-cli` to build it.
|
||||
Build with cmake or run `make llama-llava-cli` to build it.
|
||||
|
||||
After building, run: `./llava-cli` to see the usage. For example:
|
||||
After building, run: `./llama-llava-cli` to see the usage. For example:
|
||||
|
||||
```sh
|
||||
./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
|
||||
./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
|
||||
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
|
||||
--image path/to/an/image.jpg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
|
||||
|
@ -62,7 +62,7 @@ python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
|
|||
|
||||
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
|
||||
```sh
|
||||
./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
|
||||
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
|
||||
```
|
||||
|
||||
Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
|
||||
|
@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
|
|||
### case 1
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llava-cli \
|
||||
/data/local/tmp/llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
|
@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms
|
|||
### case 2
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llava-cli \
|
||||
/data/local/tmp/llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
|
@ -126,7 +126,7 @@ llama_print_timings: total time = 34570.79 ms
|
|||
#### llava-cli release-b2005
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llava-cli \
|
||||
/data/local/tmp/llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
|
@ -200,7 +200,7 @@ make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
|
|||
### case 1
|
||||
**input**
|
||||
```sh
|
||||
./llava-cli \
|
||||
./llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
--image /data/local/tmp/demo.jpeg \
|
||||
|
@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens
|
|||
### case 2
|
||||
**input**
|
||||
```sh
|
||||
./llava-cli \
|
||||
./llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
|
||||
|
|
|
@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
|
|||
After API is confirmed, more models will be supported / uploaded.
|
||||
|
||||
## Usage
|
||||
Build with cmake or run `make llava-cli` to build it.
|
||||
Build with cmake or run `make llama-llava-cli` to build it.
|
||||
|
||||
After building, run: `./llava-cli` to see the usage. For example:
|
||||
After building, run: `./llama-llava-cli` to see the usage. For example:
|
||||
|
||||
```sh
|
||||
./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
|
||||
./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
|
||||
```
|
||||
|
||||
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
|
||||
|
@ -95,9 +95,9 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto
|
|||
python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
|
||||
```
|
||||
|
||||
7) And finally we can run the llava-cli using the 1.6 model version:
|
||||
7) And finally we can run the llava cli using the 1.6 model version:
|
||||
```console
|
||||
./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
|
||||
./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
|
||||
```
|
||||
|
||||
**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
|
||||
|
|
|
@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant.
|
|||
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
|
||||
|
||||
program_dir="build_64/bin"
|
||||
binName="llava-cli"
|
||||
binName="llama-llava-cli"
|
||||
n_threads=4
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET lookahead)
|
||||
set(TARGET llama-lookahead)
|
||||
add_executable(${TARGET} lookahead.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -1,22 +1,22 @@
|
|||
set(TARGET lookup)
|
||||
set(TARGET llama-lookup)
|
||||
add_executable(${TARGET} lookup.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
set(TARGET lookup-create)
|
||||
set(TARGET llama-lookup-create)
|
||||
add_executable(${TARGET} lookup-create.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
set(TARGET lookup-merge)
|
||||
set(TARGET llama-lookup-merge)
|
||||
add_executable(${TARGET} lookup-merge.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
set(TARGET lookup-stats)
|
||||
set(TARGET llama-lookup-stats)
|
||||
add_executable(${TARGET} lookup-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -11,14 +11,14 @@
|
|||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
static void print_usage() {
|
||||
static void print_usage(char* argv0) {
|
||||
fprintf(stderr, "Merges multiple lookup cache files into a single one.\n");
|
||||
fprintf(stderr, "Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n");
|
||||
fprintf(stderr, "Usage: %s [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n", argv0);
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv){
|
||||
if (argc < 3) {
|
||||
print_usage();
|
||||
print_usage(argv[0]);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
@ -27,7 +27,7 @@ int main(int argc, char ** argv){
|
|||
for (int i = 0; i < argc-1; ++i) {
|
||||
args[i] = argv[i+1];
|
||||
if (args[i] == "-h" || args[i] == "--help") {
|
||||
print_usage();
|
||||
print_usage(argv[0]);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
cmake_minimum_required(VERSION 3.12)
|
||||
project("main-cmake-pkg" C CXX)
|
||||
set(TARGET main-cmake-pkg)
|
||||
project("llama-cli-cmake-pkg" C CXX)
|
||||
set(TARGET llama-cli-cmake-pkg)
|
||||
|
||||
find_package(Llama 0.0.1 REQUIRED)
|
||||
|
||||
# Bake common functionality in with target. Because applications
|
||||
# using the relocatable Llama package should be outside of the
|
||||
# source tree, main-cmake-pkg pretends the dependencies are built-in.
|
||||
# source tree, llama-cli-cmake-pkg pretends the dependencies are built-in.
|
||||
set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
|
||||
add_library(common OBJECT)
|
||||
file(GLOB _common_files
|
||||
|
@ -15,7 +15,7 @@ file(GLOB _common_files
|
|||
)
|
||||
target_sources(common PRIVATE ${_common_files})
|
||||
|
||||
# If the common project was part of "main-cmake-pkg" the transient
|
||||
# If the common project was part of "llama-cli-cmake-pkg" the transient
|
||||
# defines would automatically be attached. Because the common func-
|
||||
# tionality is separate, but dependent upon the defines, it must be
|
||||
# explicitly extracted from the "llama" target.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# llama.cpp/example/main-cmake-pkg
|
||||
|
||||
This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
|
||||
This program builds [llama-cli](../main) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
|
||||
|
||||
## Building
|
||||
|
||||
|
@ -20,7 +20,7 @@ cmake --build build --config Release
|
|||
cmake --install build --prefix C:/LlamaCPP
|
||||
```
|
||||
|
||||
### Build main-cmake-pkg
|
||||
### Build llama-cli-cmake-pkg
|
||||
|
||||
|
||||
```cmd
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET main)
|
||||
set(TARGET llama-cli)
|
||||
add_executable(${TARGET} main.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue