merge master
commit 3fc2a81bfa
477 changed files with 115,691 additions and 43,933 deletions
@@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
 stage('Running llama.cpp'){
     sh'''#!/bin/bash
         module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
-        qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+        qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
         cat llama_log.txt # Printing results
     '''
 }
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1

 COPY requirements.txt requirements.txt
 COPY requirements requirements
@@ -31,6 +31,6 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1

-RUN make
+RUN make -j$(nproc)

 ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev

-RUN make
+RUN make -j$(nproc)

 ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1

 COPY requirements.txt requirements.txt
 COPY requirements requirements
@@ -18,7 +18,7 @@ COPY . .
 ENV LLAMA_CURL=1


-RUN make
+RUN make -j$(nproc)

 ENV LC_ALL=C.utf8
@@ -23,10 +23,13 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1

-RUN make
+RUN make -j$(nproc) llama-cli

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

-COPY --from=build /app/main /main
+RUN apt-get update && \
+    apt-get install -y libgomp1

-ENTRYPOINT [ "/main" ]
+COPY --from=build /app/llama-cli /llama-cli
+
+ENTRYPOINT [ "/llama-cli" ]
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
@@ -15,12 +15,12 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
     export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
     fi && \
     cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
-    cmake --build build --config Release --target main
+    cmake --build build --config Release --target llama-cli

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

-COPY --from=build /app/build/bin/main /main
+COPY --from=build /app/build/bin/llama-cli /llama-cli

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]
@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

-RUN make
+RUN make -j$(nproc) llama-cli

-ENTRYPOINT [ "/app/main" ]
+ENTRYPOINT [ "/app/llama-cli" ]
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=jammy
 FROM ubuntu:$UBUNTU_VERSION as build

 # Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
+RUN apt update && apt install -y git build-essential cmake wget libgomp1

 # Install Vulkan SDK
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
@@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DLLAMA_VULKAN=1 && \
-    cmake --build build --config Release --target main
+    cmake --build build --config Release --target llama-cli

 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/main /main && \
+RUN cp /app/build/bin/llama-cli /llama-cli && \
     rm -rf /app

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]
@@ -9,12 +9,15 @@ WORKDIR /app

 COPY . .

-RUN make
+RUN make -j$(nproc) llama-cli

 FROM ubuntu:$UBUNTU_VERSION as runtime

-COPY --from=build /app/main /main
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/llama-cli /llama-cli

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]
@@ -36,9 +36,9 @@ make -j LLAMA_CLBLAST=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamaclblast
-cp -p server %{buildroot}%{_bindir}/llamaclblastserver
-cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
@@ -67,9 +67,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llamaclblast
-%{_bindir}/llamaclblastserver
-%{_bindir}/llamaclblastsimple
+%{_bindir}/llama-clblast-cli
+%{_bindir}/llama-clblast-server
+%{_bindir}/llama-clblast-simple
 /usr/lib/systemd/system/llamaclblast.service
 %config /etc/sysconfig/llama
@@ -36,9 +36,9 @@ make -j LLAMA_CUDA=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcuda
-cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
@@ -67,9 +67,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llamacppcuda
-%{_bindir}/llamacppcudaserver
-%{_bindir}/llamacppcudasimple
+%{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-server
+%{_bindir}/llama-cuda-simple
 /usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama
@@ -38,9 +38,9 @@ make -j

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llama
-cp -p server %{buildroot}%{_bindir}/llamaserver
-cp -p simple %{buildroot}%{_bindir}/llamasimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
@@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
@@ -69,9 +69,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama
-%{_bindir}/llamaserver
-%{_bindir}/llamasimple
+%{_bindir}/llama-cli
+%{_bindir}/llama-server
+%{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service
 %config /etc/sysconfig/llama
@@ -25,13 +25,13 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1

-RUN make
+RUN make -j$(nproc) llama-server

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev libgomp1

-COPY --from=build /app/server /server
+COPY --from=build /app/llama-server /llama-server

-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
@@ -15,15 +15,15 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
     export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
     fi && \
     cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
-    cmake --build build --config Release --target server
+    cmake --build build --config Release --target llama-server

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev

-COPY --from=build /app/build/bin/server /server
+COPY --from=build /app/build/bin/llama-server /llama-server

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev

-RUN make
+RUN make -j$(nproc) llama-server

-ENTRYPOINT [ "/app/server" ]
+ENTRYPOINT [ "/app/llama-server" ]
@@ -19,13 +19,13 @@ RUN apt-get update && \
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
-    cmake --build build --config Release --target server
+    cmake --build build --config Release --target llama-server

 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/server /server && \
+RUN cp /app/build/bin/llama-server /llama-server && \
     rm -rf /app

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]
@@ -11,15 +11,15 @@ COPY . .

 ENV LLAMA_CURL=1

-RUN make
+RUN make -j$(nproc) llama-server

 FROM ubuntu:$UBUNTU_VERSION as runtime

 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev libgomp1

-COPY --from=build /app/server /server
+COPY --from=build /app/llama-server /llama-server

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]
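As a rough usage sketch of the renamed server entrypoint (the image tag, host port, and model path below are illustrative placeholders, not taken from this commit):

    # start the renamed server binary from a published image (tag assumed)
    docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:server \
        -m /models/7B/ggml-model-q4_0.gguf --host 0.0.0.0 --port 8080

    # basic liveness check against the running server
    curl http://localhost:8080/health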
@@ -6,11 +6,11 @@
 let
   inherit (config.packages) default;
   binaries = [
-    "llama"
+    "llama-cli"
     "llama-embedding"
     "llama-server"
-    "quantize"
-    "train-text-from-scratch"
+    "llama-quantize"
+    "llama-train-text-from-scratch"
   ];
   mkApp = name: {
     type = "app";
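For context, the renamed flake apps would be invoked by binary name; a hypothetical example (the flake reference and model path are placeholders):

    # run the renamed CLI app exposed by the flake
    nix run .#llama-cli -- -m ./models/7B/ggml-model-q4_0.gguf -p "Hello" -n 32

    # the quantize tool follows the same naming
    nix run .#llama-quantize -- ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0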
@@ -243,8 +243,6 @@ effectiveStdenv.mkDerivation (
       # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
       # if they haven't been added yet.
       postInstall = ''
-        mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
-        mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
         mkdir -p $out/include
         cp $src/llama.h $out/include/
       '';
@@ -294,7 +292,7 @@ effectiveStdenv.mkDerivation (
       license = lib.licenses.mit;

       # Accommodates `nix run` and `lib.getExe`
-      mainProgram = "llama";
+      mainProgram = "llama-cli";

       # These people might respond, on the best effort basis, if you ping them
       # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
@@ -8,13 +8,13 @@ arg1="$1"
 shift

 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert.py "$@"
+    python3 ./convert-hf-to-gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./quantize "$@"
+    ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./main "$@"
+    ./llama-cli "$@"
 elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-    ./finetune "$@"
+    ./llama-finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -22,11 +22,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
         echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
     else
         echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-        ./quantize "$i" "${i/f16/q4_0}" q4_0
+        ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
     fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./server "$@"
+    ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
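A hedged usage sketch of the updated wrapper as dispatched from the full Docker image (the image tag and model paths are placeholders, not part of this commit):

    # convert a Hugging Face checkout, quantize the result, then run the renamed CLI
    docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --convert /models/source-model
    docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --quantize /models/ggml-model-f16.gguf /models/ggml-model-q4_0.gguf q4_0
    docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/ggml-model-q4_0.gguf -p "Hello" -n 64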
@@ -12,8 +12,8 @@ build*/

 models/*

-/main
-/quantize
+/llama-cli
+/llama-quantize

 arm_neon.h
 compile_commands.json
@@ -26,3 +26,6 @@ indent_size = 2

 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
+
+[examples/cvector-generator/*.txt]
+insert_final_newline = unset
.github/ISSUE_TEMPLATE/01-bug-low.yml (new file)
@@ -0,0 +1,50 @@
name: Low Severity Bugs
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
title: "Bug: "
labels: ["bug-unconfirmed", "low severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
.github/ISSUE_TEMPLATE/02-bug-medium.yml (new file)
@@ -0,0 +1,50 @@
name: Medium Severity Bug
description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
title: "Bug: "
labels: ["bug-unconfirmed", "medium severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
.github/ISSUE_TEMPLATE/03-bug-high.yml (new file)
@@ -0,0 +1,50 @@
name: High Severity Bug
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
title: "Bug: "
labels: ["bug-unconfirmed", "high severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
.github/ISSUE_TEMPLATE/04-bug-critical.yml (new file)
@@ -0,0 +1,50 @@
name: Critical Severity Bug
description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
title: "Bug: "
labels: ["bug-unconfirmed", "critical severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
.github/ISSUE_TEMPLATE/05-enhancement.yml (new file)
@@ -0,0 +1,51 @@
name: Enhancement
description: Used to request enhancements for llama.cpp
title: "Feature Request: "
labels: ["enhancement"]
body:
  - type: markdown
    attributes:
      value: |
        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)

  - type: checkboxes
    id: prerequisites
    attributes:
      label: Prerequisites
      description: Please confirm the following before submitting your enhancement request.
      options:
        - label: I am running the latest code. Mention the version if possible as well.
          required: true
        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
          required: true
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
          required: true

  - type: textarea
    id: feature-description
    attributes:
      label: Feature Description
      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
      placeholder: Detailed description of the enhancement
    validations:
      required: true

  - type: textarea
    id: motivation
    attributes:
      label: Motivation
      description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
      placeholder: Explanation of why this feature is needed and its benefits
    validations:
      required: true

  - type: textarea
    id: possible-implementation
    attributes:
      label: Possible Implementation
      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
      placeholder: Detailed description of potential implementation
    validations:
      required: false
.github/ISSUE_TEMPLATE/06-research.yml (new file)
@@ -0,0 +1,52 @@
name: Research
description: Track new technical research area
title: "Research: "
labels: ["research 🔬"]
body:
  - type: markdown
    attributes:
      value: |
        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

  - type: checkboxes
    id: research-stage
    attributes:
      label: Research Stage
      description: Track general state of this research ticket
      options:
        - label: Background Research (Let's try to avoid reinventing the wheel)
        - label: Hypothesis Formed (How do you think this will work and it's effect?)
        - label: Strategy / Implementation Forming
        - label: Analysis of results
        - label: Debrief / Documentation (So people in the future can learn from us)

  - type: textarea
    id: background
    attributes:
      label: Previous existing literature and research
      description: Whats the current state of the art and whats the motivation for this research?

  - type: textarea
    id: hypothesis
    attributes:
      label: Hypothesis
      description: How do you think this will work and it's effect?

  - type: textarea
    id: implementation
    attributes:
      label: Implementation
      description: Got an approach? e.g. a PR ready to go?

  - type: textarea
    id: analysis
    attributes:
      label: Analysis
      description: How does the proposed implementation behave?

  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
.github/ISSUE_TEMPLATE/07-refactor.yml (new file)
@@ -0,0 +1,28 @@
name: Refactor (Maintainers)
description: Used to track refactoring opportunities
title: "Refactor: "
labels: ["refactor"]
body:
  - type: markdown
    attributes:
      value: |
        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

  - type: textarea
    id: background-description
    attributes:
      label: Background Description
      description: Please provide a detailed written description of the pain points you are trying to solve.
      placeholder: Detailed description behind your motivation to request refactor
    validations:
      required: true

  - type: textarea
    id: possible-approaches
    attributes:
      label: Possible Refactor Approaches
      description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
      placeholder: Your idea of possible refactoring opportunity/approaches
    validations:
      required: false
.github/ISSUE_TEMPLATE/bug.md (file deleted)
@@ -1,11 +0,0 @@
---
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug-unconfirmed"]
assignees: ''

---

Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.

If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
.github/ISSUE_TEMPLATE/config.yml (new file)
@@ -0,0 +1,13 @@
blank_issues_enabled: true
contact_links:
  - name: Got an idea?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
    about: Pop it there. It may then become an enhancement ticket.
  - name: Got a question?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
    about: Ask a question there!
  - name: Want to contribute?
    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
.github/ISSUE_TEMPLATE/enhancement.md (file deleted)
@@ -1,28 +0,0 @@
---
name: Enhancement template
about: Used to request enhancements for llama.cpp
labels: ["enhancement"]
assignees: ''

---

# Prerequisites

Please answer the following questions for yourself before submitting an issue.

- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.

# Feature Description

Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.

# Motivation

Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.

# Possible Implementation

If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
.github/labeler.yml
@@ -1,5 +1,16 @@
 # https://github.com/actions/labeler
+Kompute:
+  - changed-files:
+    - any-glob-to-any-file:
+      - ggml-kompute.h
+      - ggml-kompute.cpp
+      - README-kompute.md
+Apple Metal:
+  - changed-files:
+    - any-glob-to-any-file:
+      - ggml-metal.h
+      - ggml-metal.cpp
+      - README-metal.md
 SYCL:
   - changed-files:
     - any-glob-to-any-file:
@@ -9,6 +20,7 @@ SYCL:
 Nvidia GPU:
   - changed-files:
     - any-glob-to-any-file:
+      - ggml-cuda.h
       - ggml-cuda/**
 Vulkan:
   - changed-files:
.github/pull_request_template.md (new file)
@@ -0,0 +1,7 @@


- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
- Self-reported review complexity:
  - [ ] Low
  - [ ] Medium
  - [ ] High
.github/workflows/bench.yml
@@ -119,7 +119,7 @@ jobs:
            -DLLAMA_FATAL_WARNINGS=OFF \
            -DLLAMA_ALL_WARNINGS=OFF \
            -DCMAKE_BUILD_TYPE=Release;
-          cmake --build build --config Release -j $(nproc) --target server
+          cmake --build build --config Release -j $(nproc) --target llama-server

      - name: Download the dataset
        id: download_dataset
.github/workflows/build.yml
@@ -13,7 +13,7 @@ on:
    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
  pull_request:
    types: [opened, synchronize, reopened]
-   paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
+   paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -84,7 +84,7 @@ jobs:
          name: llama-bin-macos-arm64.zip

  macOS-latest-cmake-x64:
-   runs-on: macos-latest
+   runs-on: macos-12

    steps:
      - name: Clone
@@ -103,12 +103,10 @@ jobs:
        id: cmake_build
        run: |
          sysctl -a
-         mkdir build
-         cd build
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-         cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
-         cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+         cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
+         cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: cmake_test
@@ -241,8 +239,8 @@ jobs:
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
          echo "Fetch llama2c model"
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-         ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-         ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+         ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+         ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

      - name: Determine tag name
        id: tag
@@ -294,12 +292,22 @@ jobs:

      - name: Build
        id: cmake_build
+       if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          mkdir build
          cd build
          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

+     - name: Build (no OpenMP)
+       id: cmake_build_no_openmp
+       if: ${{ matrix.sanitizer == 'THREAD' }}
+       run: |
+         mkdir build
+         cd build
+         cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
+         cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+
      - name: Test
        id: cmake_test
        run: |
@@ -674,12 +682,10 @@ jobs:
          cmake --build build --config ${{ matrix.build }} -j $(nproc)

  windows-latest-cmake:
-   runs-on: windows-latest
+   runs-on: windows-2019

    env:
      OPENBLAS_VERSION: 0.3.23
-     OPENCL_VERSION: 2023.04.17
-     CLBLAST_VERSION: 1.6.0
      SDE_VERSION: 9.33.0-2024-01-07
      VULKAN_VERSION: 1.3.261.1

@@ -696,8 +702,6 @@ jobs:
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx512-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
-         - build: 'clblast-x64'
-           defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
          - build: 'openblas-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute-x64'
@@ -722,27 +726,6 @@ jobs:
        run: |
          git submodule update --init kompute

-     - name: Download OpenCL SDK
-       id: get_opencl
-       if: ${{ matrix.build == 'clblast-x64' }}
-       run: |
-         curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
-         mkdir $env:RUNNER_TEMP/opencl
-         tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
-
-     - name: Download CLBlast
-       id: get_clblast
-       if: ${{ matrix.build == 'clblast-x64' }}
-       run: |
-         curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
-         curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
-         7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
-         rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
-         foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
-           $txt = Get-Content -Path $f -Raw
-           $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
-         }
-
      - name: Download OpenBLAS
        id: get_openblas
        if: ${{ matrix.build == 'openblas-x64' }}
@@ -776,13 +759,6 @@ jobs:
          cmake -S . -B build ${{ matrix.defines }}
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}

-     - name: Add clblast.dll
-       id: add_clblast_dll
-       if: ${{ matrix.build == 'clblast-x64' }}
-       run: |
-         cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
-         cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
-
      - name: Add libopenblas.dll
        id: add_libopenblas_dll
        if: ${{ matrix.build == 'openblas-x64' }}
@@ -806,7 +782,7 @@ jobs:
      - name: Test
        id: cmake_test
        # not all machines have native AVX-512
-       if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+       if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900
@@ -851,7 +827,7 @@ jobs:
          name: llama-bin-win-${{ matrix.build }}.zip

  windows-latest-cmake-cuda:
-   runs-on: windows-latest
+   runs-on: windows-2019

    strategy:
      matrix:
@@ -865,8 +841,9 @@ jobs:
        with:
          fetch-depth: 0

-     - uses: Jimver/cuda-toolkit@v0.2.11
+     - name: Install CUDA toolkit
        id: cuda-toolkit
+       uses: Jimver/cuda-toolkit@v0.2.15
        with:
          cuda: ${{ matrix.cuda }}
          method: 'network'
@@ -1061,7 +1038,7 @@ jobs:
  #        hypervisor: 'qemu'
  #        run: |
  #          sudo pkg update
- #          sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
+ #          sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
  #          gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`

  release:
.github/workflows/docker.yml
@@ -30,21 +30,20 @@ jobs:
    strategy:
      matrix:
        config:
-         - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+         - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+         - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-         - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
          #                     have disabled them for now until the reason why
          #                     is understood.
-         - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
+         - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
+         - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-         - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
-         - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+         - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+         - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-         - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-         # TODO: Disabled due to build issues https://github.com/ggerganov/llama.cpp/issues/7507
-         #- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
-         #- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
+         - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
+         - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
.github/workflows/server.yml (10 changes, vendored)

@@ -16,11 +16,9 @@ on:
     branches:
       - master
     paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
-  pull_request_target:
+  pull_request:
     types: [opened, synchronize, reopened]
     paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
-  schedule:
-    - cron: '2 4 * * *'

 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}

@@ -98,7 +96,7 @@ jobs:
             -DLLAMA_CURL=ON \
             -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
             -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

       - name: Tests
         id: server_integration_tests

@@ -115,7 +113,7 @@ jobs:
   server-windows:
-    runs-on: windows-latest
+    runs-on: windows-2019

     steps:
       - name: Clone

@@ -138,7 +136,7 @@ jobs:
         id: cmake_build
         run: |
           cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

       - name: Python setup
         id: setup_python
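A local equivalent of the renamed CI build step above, for reference (the server binary is now built as llama-server rather than server):

    cmake -B build -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release
    cmake --build build --config Release -j $(nproc) --target llama-server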
.gitignore (46 changes, vendored)

@@ -34,9 +34,11 @@ ggml-metal-embed.metal
 lcov-report/
 gcovr-report/

+tags
 build*
 !build.zig
 cmake-build-*
+android-ndk-*
 out/
 tmp/

@@ -44,48 +46,9 @@ models/*
 models-mnt

 /Pipfile
-/baby-llama
-/beam-search
-/benchmark-matmult
-/convert-llama2c-to-ggml
-/embd-input-test
-/embedding
-/eval-callback
-/gguf
-/gguf-llama-simple
-/gguf-split
-/gritlm
-/imatrix
-/infill
 /libllama.so
-/llama-bench
-/llava-cli
+/llama-*
+llama-batched-swift
-/lookahead
-/lookup
-/lookup-create
-/lookup-merge
-/lookup-stats
-/main
-/metal
-/passkey
-/perplexity
-/q8dot
-/quantize
-/quantize-stats
-/result
-/save-load-state
-/server
-/simple
-/batched
-/batched-bench
-/export-lora
-/finetune
-/retrieval
-/speculative
-/parallel
-/train-text-from-scratch
-/tokenize
-/vdot
 /common/build-info.cpp
 arm_neon.h
 compile_commands.json

@@ -105,6 +68,7 @@ examples/jeopardy/results.txt
 examples/server/*.html.hpp
 examples/server/*.js.hpp
 examples/server/*.mjs.hpp
+examples/server/*.css.hpp

 poetry.lock
 poetry.toml
CMakeLists.txt (161 changes)

@@ -39,8 +39,12 @@ endif()

 if (APPLE)
     set(LLAMA_METAL_DEFAULT ON)
+    set(LLAMA_BLAS_DEFAULT ON)
+    set(LLAMA_BLAS_VENDOR_DEFAULT "Apple")
 else()
     set(LLAMA_METAL_DEFAULT OFF)
+    set(LLAMA_BLAS_DEFAULT OFF)
+    set(LLAMA_BLAS_VENDOR_DEFAULT "Generic")
 endif()

 set(LLAMA_LLAMAFILE_DEFAULT ON)

@@ -72,6 +76,7 @@ else()
     set(INS_ENB ON)
 endif()

+option(LLAMA_SVE "llama: enable SVE" OFF)
 option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
 option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
 option(LLAMA_AVX512 "llama: enable AVX512" OFF)

@@ -90,9 +95,10 @@ endif()

 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_BLAS "llama: use BLAS" OFF)
+option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT})
+set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING
+    "llama: BLAS library vendor")
 option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
-set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUDA "llama: use CUDA" OFF)
 option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)

@@ -105,14 +111,15 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
 option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
+option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for FlashAttention" OFF)

 option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
-option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_VULKAN "llama: use Vulkan" OFF)
 option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
 option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
+option(LLAMA_VULKAN_MEMORY_DEBUG "llama: enable Vulkan memory debug output" OFF)
 option(LLAMA_VULKAN_VALIDATE "llama: enable Vulkan validation" OFF)
 option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF)
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})

@@ -124,6 +131,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_RPC "llama: use RPC" OFF)
+option(LLAMA_OPENMP "llama: use OpenMP" ON)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
 set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
@@ -294,13 +302,24 @@ if (LLAMA_METAL)
         )
 endif()

+if (LLAMA_OPENMP)
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        message(STATUS "OpenMP found")
+        add_compile_definitions(GGML_USE_OPENMP)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    else()
+        message(WARNING "OpenMP not found")
+    endif()
+endif()
+
 if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
-    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
-        set(BLA_SIZEOF_INTEGER 8)
-    endif()
+    #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
+    #    set(BLA_SIZEOF_INTEGER 8)
+    #endif()

     set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
     find_package(BLAS)
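Since LLAMA_OPENMP now defaults to ON, a CMake build that wants to opt out of the new OpenMP dependency would configure roughly like this (illustrative command, not part of the commit):

    cmake -B build -DLLAMA_OPENMP=OFF
    cmake --build build -j $(nproc)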
@@ -308,7 +327,7 @@ if (LLAMA_BLAS)
     if (BLAS_FOUND)
         message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")

-        if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
+        if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple"))
             # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
             # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
             find_package(PkgConfig REQUIRED)

@@ -361,12 +380,15 @@ if (LLAMA_BLAS)

         add_compile_options(${BLAS_LINKER_FLAGS})

-        add_compile_definitions(GGML_USE_OPENBLAS)
+        add_compile_definitions(GGML_USE_BLAS)

         if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
             add_compile_definitions(GGML_BLAS_USE_MKL)
         endif()

+        set(GGML_HEADERS_BLAS ggml-blas.h)
+        set(GGML_SOURCES_BLAS ggml-blas.cpp)
+
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
     else()
@@ -389,18 +411,36 @@ if (LLAMA_CUBLAS)
 endif()

 if (LLAMA_CUDA)
-    cmake_minimum_required(VERSION 3.17)
+    cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES

     find_package(CUDAToolkit)
     if (CUDAToolkit_FOUND)
         message(STATUS "CUDA found")

+        if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+            # 52 == lowest CUDA 12 standard
+            # 60 == f16 CUDA intrinsics
+            # 61 == integer CUDA intrinsics
+            # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+            if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
+                set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
+            else()
+                set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+                #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
+            endif()
+        endif()
+        message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
         enable_language(CUDA)

         set(GGML_HEADERS_CUDA ggml-cuda.h)

         file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
         list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})

         add_compile_definitions(GGML_USE_CUDA)
         add_compile_definitions(GGML_CUDA_USE_GRAPHS)

@@ -426,6 +466,18 @@ if (LLAMA_CUDA)
         if (LLAMA_CUDA_NO_PEER_COPY)
             add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
         endif()
+        if (LLAMA_CUDA_FA_ALL_QUANTS)
+            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
+            list(APPEND GGML_SOURCES_CUDA ${SRCS})
+            add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+        else()
+            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+            list(APPEND GGML_SOURCES_CUDA ${SRCS})
+            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+            list(APPEND GGML_SOURCES_CUDA ${SRCS})
+            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+            list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        endif()

         if (LLAMA_STATIC)
             if (WIN32)

@@ -443,21 +495,6 @@ if (LLAMA_CUDA)
         else()
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
         endif()

-        if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-            # 52 == lowest CUDA 12 standard
-            # 60 == f16 CUDA intrinsics
-            # 61 == integer CUDA intrinsics
-            # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
-            if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-                set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
-            else()
-                set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
-                #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
-            endif()
-        endif()
-        message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-
     else()
         message(WARNING "CUDA not found")
     endif()
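The architecture defaults above only apply when CMAKE_CUDA_ARCHITECTURES is undefined, so a build targeting a single GPU generation can still override it, for example (illustrative):

    cmake -B build -DLLAMA_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=86
    cmake --build build -j $(nproc)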
@@ -474,22 +511,6 @@ if (LLAMA_RPC)
     set(GGML_SOURCES_RPC ggml-rpc.cpp)
 endif()

-if (LLAMA_CLBLAST)
-    find_package(CLBlast)
-    if (CLBlast_FOUND)
-        message(STATUS "CLBlast found")
-
-        set(GGML_HEADERS_OPENCL ggml-opencl.h)
-        set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
-
-        add_compile_definitions(GGML_USE_CLBLAST)
-
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
-    else()
-        message(WARNING "CLBlast not found")
-    endif()
-endif()
-
 if (LLAMA_VULKAN)
     find_package(Vulkan)
     if (Vulkan_FOUND)

@@ -514,6 +535,10 @@ if (LLAMA_VULKAN)
             add_compile_definitions(GGML_VULKAN_DEBUG)
         endif()

+        if (LLAMA_VULKAN_MEMORY_DEBUG)
+            add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
+        endif()
+
         if (LLAMA_VULKAN_VALIDATE)
             add_compile_definitions(GGML_VULKAN_VALIDATE)
         endif()
@@ -529,12 +554,17 @@ if (LLAMA_VULKAN)
     endif()

 if (LLAMA_HIPBLAS)
-    if ($ENV{ROCM_PATH})
-        set(ROCM_PATH $ENV{ROCM_PATH})
+    if (NOT EXISTS $ENV{ROCM_PATH})
+        if (NOT EXISTS /opt/rocm)
+            set(ROCM_PATH /usr)
+        else()
+            set(ROCM_PATH /opt/rocm)
+        endif()
     else()
-        set(ROCM_PATH /opt/rocm)
+        set(ROCM_PATH $ENV{ROCM_PATH})
     endif()
     list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
+    list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")

     # CMake on Windows doesn't support the HIP language yet
     if(WIN32)

@@ -570,6 +600,10 @@ if (LLAMA_HIPBLAS)

         file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
         list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})

         add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)

@@ -589,6 +623,19 @@ if (LLAMA_HIPBLAS)
             add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
         endif()

+        if (LLAMA_CUDA_FA_ALL_QUANTS)
+            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
+            list(APPEND GGML_SOURCES_ROCM ${SRCS})
+            add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+        else()
+            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+            list(APPEND GGML_SOURCES_ROCM ${SRCS})
+            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+            list(APPEND GGML_SOURCES_ROCM ${SRCS})
+            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+            list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        endif()
+
         add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

@@ -627,6 +674,10 @@ if (LLAMA_SYCL)
         add_compile_definitions(GGML_SYCL_F16)
     endif()

+    if (LLAMA_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_SYCL_FORCE_MMQ)
+    endif()
+
     add_compile_options(-I./) #include DPCT
     add_compile_options(-I/${SYCL_INCLUDE_DIR})

@@ -638,7 +689,8 @@ if (LLAMA_SYCL)
     endif()

     set(GGML_HEADERS_SYCL ggml-sycl.h)
-    set(GGML_SOURCES_SYCL ggml-sycl.cpp)
+    file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
+    list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")

     if (WIN32)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)

@@ -742,6 +794,7 @@ if (LLAMA_KOMPUTE)
         kompute-shaders/op_mul_mat_q4_0.comp
         kompute-shaders/op_mul_mat_q4_1.comp
         kompute-shaders/op_mul_mat_q6_k.comp
+        kompute-shaders/op_getrows_f32.comp
         kompute-shaders/op_getrows_f16.comp
         kompute-shaders/op_getrows_q4_0.comp
         kompute-shaders/op_getrows_q4_1.comp

@@ -774,6 +827,7 @@ if (LLAMA_KOMPUTE)
         shaderop_mul_mat_q4_0.h
         shaderop_mul_mat_q4_1.h
         shaderop_mul_mat_q6_k.h
+        shaderop_getrows_f32.h
         shaderop_getrows_f16.h
         shaderop_getrows_q4_0.h
         shaderop_getrows_q4_1.h

@@ -1040,6 +1094,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
             # Raspberry Pi 3, 4, Zero 2 (32-bit)
             list(APPEND ARCH_FLAGS -mno-unaligned-access)
         endif()
+        if (LLAMA_SVE)
+            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
+        endif()
     endif()
 elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
         (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
@@ -1208,7 +1265,6 @@ add_library(ggml OBJECT
             ggml-quants.c
             ggml-quants.h
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
-            ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
             ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}

@@ -1216,6 +1272,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
            ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
             ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
             ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
             )

@@ -1296,8 +1353,9 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)

 set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
-    "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
-    "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
+    "${GGML_HEADERS_CUDA}"
+    "${GGML_HEADERS_METAL}"
+    "${GGML_HEADERS_EXTRA}")

 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 install(TARGETS ggml PUBLIC_HEADER)

@@ -1306,7 +1364,7 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
 install(TARGETS llama LIBRARY PUBLIC_HEADER)

 install(
-    FILES convert.py
+    FILES convert-hf-to-gguf.py
     PERMISSIONS
         OWNER_READ
         OWNER_WRITE

@@ -1333,6 +1391,13 @@ if (LLAMA_METAL)
     endif()
 endif()

+configure_file(cmake/llama.pc.in
+        "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+        @ONLY)
+
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+        DESTINATION lib/pkgconfig)
+
 #
 # programs, examples and tests
 #
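With llama.pc installed to lib/pkgconfig, a downstream program could in principle be compiled via pkg-config; the exact flags depend on what cmake/llama.pc.in exports, and my_app.cpp here is a hypothetical consumer:

    cmake --build build && cmake --install build
    g++ my_app.cpp $(pkg-config --cflags --libs llama) -o my_app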
CMakePresets.json

@@ -1,4 +1,4 @@
 {
     "version": 4,
     "configurePresets": [
         {

@@ -40,6 +40,10 @@

         { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
         { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
-        { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
+        { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
+
+        { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
+        { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
+        { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
     ]
 }
CONTRIBUTING.md (14 changes, new file)

@@ -0,0 +1,14 @@
+# Contributing Guidelines
+
+## Checklist
+
+* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
+* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
+* Execute [the full CI locally on your machine](ci/README.md) before publishing
+
+## PR formatting
+
+* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
+  - The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your convenience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
+* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
+* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`
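The checklist and title convention in the new file translate into commands like the following (illustrative, reusing the example given in the file itself):

    ./tests/test-backend-ops
    git commit -m "utils : Fix typo in utils.py (#1234)"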
Makefile (263 changes)

@@ -1,8 +1,45 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
-	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
-	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+	libllava.a \
+	llama-baby-llama \
+	llama-batched \
+	llama-batched-bench \
+	llama-bench \
+	llama-benchmark-matmult \
+	llama-cli \
+	llama-convert-llama2c-to-ggml \
+	llama-embedding \
+	llama-eval-callback \
+	llama-export-lora \
+	llama-finetune \
+	llama-gbnf-validator \
+	llama-gguf \
+	llama-gguf-split \
+	llama-gritlm \
+	llama-imatrix \
+	llama-infill \
+	llama-llava-cli \
+	llama-lookahead \
+	llama-lookup \
+	llama-lookup-create \
+	llama-lookup-merge \
+	llama-lookup-stats \
+	llama-parallel \
+	llama-passkey \
+	llama-perplexity \
+	llama-q8dot \
+	llama-quantize \
+	llama-quantize-stats \
+	llama-retrieval \
+	llama-save-load-state \
+	llama-server \
+	llama-simple \
+	llama-speculative \
+	llama-tokenize \
+	llama-train-text-from-scratch \
+	llama-vdot \
+	llama-cvector-generator \
+	tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \

@@ -57,6 +94,8 @@ ifeq ($(UNAME_S),Darwin)
 		LLAMA_METAL := 1
 	endif

+	LLAMA_NO_OPENMP := 1
+
 	ifneq ($(UNAME_P),arm)
 		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
 		ifeq ($(SYSCTL_M),1)

@@ -67,6 +106,10 @@ ifeq ($(UNAME_S),Darwin)
 	endif
 endif

+ifdef LLAMA_RPC
+	BUILD_TARGETS += rpc-server
+endif
+
 default: $(BUILD_TARGETS)

 test: $(TEST_TARGETS)
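With the old short names gone from BUILD_TARGETS, the renamed binaries are requested directly, e.g.:

    make -j$(nproc) llama-cli llama-server llama-quantize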
@@ -135,12 +178,16 @@ MK_NVCCFLAGS = -std=c++11
 ifdef LLAMA_FAST
 MK_CFLAGS     += -Ofast
 HOST_CXXFLAGS += -Ofast
+ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
+endif # LLAMA_DEBUG
 else
 MK_CFLAGS     += -O3
 MK_CXXFLAGS   += -O3
+ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
-endif
+endif # LLAMA_DEBUG
+endif # LLAMA_FAST

 ifndef LLAMA_NO_CCACHE
 CCACHE := $(shell which ccache)

@@ -201,9 +248,10 @@ ifdef LLAMA_SCHED_MAX_COPIES
 endif

 ifdef LLAMA_DEBUG
 	MK_CFLAGS    += -O0 -g
 	MK_CXXFLAGS  += -O0 -g
 	MK_LDFLAGS   += -g
+	MK_NVCCFLAGS += -O0 -g

 	ifeq ($(UNAME_S),Linux)
 		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
@@ -393,34 +441,65 @@ ifndef LLAMA_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
 	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
 	ifeq ($(UNAME_S),Darwin)
-		MK_CPPFLAGS += -DGGML_USE_ACCELERATE
+		MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
 		MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
 		MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
 		MK_LDFLAGS  += -framework Accelerate
+		OBJS        += ggml-blas.o
 	endif
 endif # LLAMA_NO_ACCELERATE

+ifndef LLAMA_NO_OPENMP
+	MK_CPPFLAGS += -DGGML_USE_OPENMP
+	MK_CFLAGS   += -fopenmp
+	MK_CXXFLAGS += -fopenmp
+endif # LLAMA_NO_OPENMP
+
 ifdef LLAMA_OPENBLAS
-	MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
+	MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
 	MK_LDFLAGS  += $(shell pkg-config --libs openblas)
+	OBJS        += ggml-blas.o
 endif # LLAMA_OPENBLAS

+ifdef LLAMA_OPENBLAS64
+	MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
+	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas64)
+	MK_LDFLAGS  += $(shell pkg-config --libs openblas64)
+	OBJS        += ggml-blas.o
+endif # LLAMA_OPENBLAS64
+
+ifdef LLAMA_BLIS
+	MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
+	MK_LDFLAGS  += -lblis -L/usr/local/lib
+	OBJS        += ggml-blas.o
+endif # LLAMA_BLIS
+
 ifndef LLAMA_NO_LLAMAFILE
 	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
 	OBJS        += sgemm.o
 endif

-ifdef LLAMA_BLIS
-	MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
-	MK_LDFLAGS  += -lblis -L/usr/local/lib
-endif # LLAMA_BLIS
+ifdef LLAMA_RPC
+	MK_CPPFLAGS += -DGGML_USE_RPC
+	OBJS        += ggml-rpc.o
+endif # LLAMA_RPC

 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
 	LLAMA_CUDA := 1
 endif

+OBJS_CUDA_TEMP_INST  = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu))
+ifdef LLAMA_CUDA_FA_ALL_QUANTS
+	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
+else
+	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+endif # LLAMA_CUDA_FA_ALL_QUANTS
+
 ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
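The Makefile mirrors the CMake defaults: OpenMP is enabled unless LLAMA_NO_OPENMP is set (as macOS does above), and the BLAS variants now compile ggml-blas.o and define GGML_USE_BLAS. Illustrative invocations:

    make LLAMA_NO_OPENMP=1 -j$(nproc) llama-cli
    make LLAMA_OPENBLAS=1 -j$(nproc) llama-cli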
@@ -431,6 +510,7 @@ ifdef LLAMA_CUDA
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
 	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+	OBJS         += $(OBJS_CUDA_TEMP_INST)
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings

@@ -441,6 +521,9 @@ endif # JETSON_EOL_MODULE_DETECT
 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
 endif # LLAMA_DEBUG
+ifdef LLAMA_CUDA_DEBUG
+	MK_NVCCFLAGS += --device-debug
+endif # LLAMA_CUDA_DEBUG
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
 else

@@ -490,7 +573,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
-endif
+endif # LLAMA_CUDA_CCBIN
+ifdef LLAMA_CUDA_FA_ALL_QUANTS
+	MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # LLAMA_CUDA_FA_ALL_QUANTS

 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE

@@ -502,30 +588,13 @@ define NVCC_COMPILE
 endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT

-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(NVCC_COMPILE)

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
 endif # LLAMA_CUDA

-ifdef LLAMA_CLBLAST
-	MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
-	MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
-	MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
-
-	# Mac provides OpenCL as a framework
-	ifeq ($(UNAME_S),Darwin)
-		MK_LDFLAGS += -lclblast -framework OpenCL
-	else
-		MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
-	endif
-	OBJS += ggml-opencl.o
-
-ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # LLAMA_CLBLAST
-
 ifdef LLAMA_VULKAN
 	MK_CPPFLAGS += -DGGML_USE_VULKAN
 	MK_LDFLAGS  += -lvulkan

@@ -539,6 +608,10 @@ ifdef LLAMA_VULKAN_DEBUG
 	MK_CPPFLAGS += -DGGML_VULKAN_DEBUG
 endif

+ifdef LLAMA_VULKAN_MEMORY_DEBUG
+	MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
+endif
+
 ifdef LLAMA_VULKAN_VALIDATE
 	MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
 endif

@@ -568,6 +641,7 @@ ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
 	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+	MK_LDFLAGS  += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
 	MK_LDFLAGS  += -lhipblas -lamdhip64 -lrocblas
 	HIPFLAGS    += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
 	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)

@@ -581,11 +655,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS += ggml-cuda.o
 	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+	OBJS += $(OBJS_CUDA_TEMP_INST)

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

 endif # LLAMA_HIPBLAS

@@ -623,11 +698,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL

+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif

+ifdef LLAMA_RPC
+ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+endif # LLAMA_RPC
+
 GF_CC := $(CC)
 include scripts/get-flags.mk
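With LLAMA_RPC set, rpc-server is added to BUILD_TARGETS and built from the rules above, e.g.:

    make LLAMA_RPC=1 -j$(nproc) rpc-server llama-cli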
@@ -701,20 +791,18 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 	$(CC) $(CFLAGS) -c $< -o $@

+ggml-blas.o: ggml-blas.cpp ggml-blas.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 unicode.o: unicode.cpp unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 unicode-data.o: unicode-data.cpp unicode-data.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
-
 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@

@@ -743,8 +831,9 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
 	ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)

 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.a *.dll common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -vrf ggml-cuda/*.o
+	rm -vrf ggml-cuda/template-instances/*.o
 	find examples pocs -type f -name "*.o" -delete

 #
@@ -758,62 +847,62 @@ clean:
 # Helper function that replaces .c, .cpp, and .cu file endings with .o:
 GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))

-main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+llama-cli: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 	@echo
-	@echo '==== Run ./main -h for help. ===='
+	@echo '==== Run ./llama-cli -h for help. ===='
 	@echo

-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+llama-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
+llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+llama-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -826,23 +915,27 @@ examples/server/%.hpp: examples/server/public/% Makefile
 	echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
 	) > $@

-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
+llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
+llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -853,59 +946,61 @@ llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS)
 libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual

-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)

-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+llama-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
|
llama-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
llama-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
llama-lookup-create: examples/lookup/lookup-create.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
|
||||||
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
|
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
|
|
||||||
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
|
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
|
|
||||||
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
|
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
|
|
||||||
|
|
||||||
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
llama-lookup-merge: examples/lookup/lookup-merge.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
llama-lookup-stats: examples/lookup/lookup-stats.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
llama-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
@ -931,20 +1026,20 @@ build-info.o: common/build-info.cpp
|
||||||
|
|
||||||
tests: $(TEST_TARGETS)
|
tests: $(TEST_TARGETS)
|
||||||
|
|
||||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
|
llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
run-benchmark-matmult: benchmark-matmult
|
run-benchmark-matmult: llama-benchmark-matmult
|
||||||
./$@
|
./$@
|
||||||
|
|
||||||
.PHONY: run-benchmark-matmult swift
|
.PHONY: run-benchmark-matmult swift
|
||||||
|
|
||||||
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
llama-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
|
llama-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
|
|
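The renamed example binaries above are built with the same recipes as before; only the target names gain the `llama-` prefix. As an illustrative, hedged sketch (target names taken from the rules above, the parallelism flag is an assumption):

```sh
# build a few of the renamed example binaries in parallel
make -j$(nproc) llama-server llama-gguf llama-gguf-split
```
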
@@ -1,6 +1,7 @@
# llama.cpp for SYCL

- [Background](#background)
- [Recommended Release](#recommended-release)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)

@@ -29,10 +30,25 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o

When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.

It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

## Recommended Release

The SYCL backend would be broken by some PRs due to no online CI.

The following release is verified with good quality:

|Commit ID|Tag|Release|Verified Platform|
|-|-|-|-|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|

## News

- 2024.5
  - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
  - Arch Linux is verified successfully.

- 2024.4
  - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.

@@ -54,10 +70,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,

## OS

| OS | Status | Verified |
|---------|---------|------------------------------------|
|---------|---------|------------------------------------------------|
| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 |
| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
| Windows | Support | Windows 11 |

## Hardware

@@ -70,14 +86,14 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
|-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc Series | Support | Arc 770, 730M |
| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |

*Notes:*

- **Memory**
  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.

  - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.

@@ -99,14 +115,14 @@ The docker build option is currently limited to *intel GPU* targets.
### Build image
```sh
# Using FP16
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
```

*Notes*:

To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.

You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.

### Run container

@@ -275,7 +291,7 @@ source /opt/intel/oneapi/setvars.sh
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:

```sh
./build/bin/ls-sycl-device
./build/bin/llama-ls-sycl-device
```
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
```

@@ -313,7 +329,7 @@ Examples:
- Use device 0:

```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
or run by script:

@@ -324,7 +340,7 @@ or run by script:
- Use multiple devices:

```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```

Otherwise, you can run the script:

@@ -427,7 +443,7 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in

*Notes:*

- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`.

### III. Run the inference

@@ -488,13 +504,13 @@ Examples:
- Use device 0:

```
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
```

- Use multiple devices:

```
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
Otherwise, run the following wrapper script:

276
README.md
276
README.md
|
@@ -2,12 +2,17 @@



[](https://opensource.org/licenses/MIT) [](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[](https://opensource.org/licenses/MIT)
[](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[](https://conan.io/center/llama-cpp)

[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

> [!IMPORTANT]
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)

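To make the rename concrete, here is a hedged before/after sketch of the two most common invocations (model path is a placeholder):

```sh
# before the rename
./main   -m ./models/7B/ggml-model-q4_0.gguf -p "Hello"
./server -m ./models/7B/ggml-model-q4_0.gguf

# after the rename
./llama-cli    -m ./models/7B/ggml-model-q4_0.gguf -p "Hello"
./llama-server -m ./models/7B/ggml-model-q4_0.gguf
```
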
### Recent API changes

- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807

@@ -20,7 +25,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

### Hot topics

- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404

@@ -50,7 +56,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
<li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
<li><a href="#instruct-mode">Instruct mode</a></li>
<li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>

@@ -74,7 +79,7 @@ variety of hardware - locally and in the cloud.
- AVX, AVX2 and AVX512 support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
- Vulkan, SYCL, and (partial) OpenCL backend support
- Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has

@@ -147,6 +152,8 @@ Typically finetunes of the base models below are supported as well.

[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.

[simplechat](./examples/server/public_simplechat) is a simple chat client, which can be used to chat with the model exposed using above web server (use --path to point to simplechat), from a local web browser.

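As a hedged illustration of the two paragraphs above (model path is a placeholder; `--path` is the flag mentioned in the text, the `--port` value is an assumption):

```sh
# serve a local model over an OpenAI-compatible HTTP API
./llama-server -m ./models/7B/ggml-model-q4_0.gguf --port 8080

# additionally serve the bundled simplechat client to a local web browser
./llama-server -m ./models/7B/ggml-model-q4_0.gguf --port 8080 --path ./examples/server/public_simplechat
```
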
**Bindings:**

- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)

@@ -188,6 +195,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [RAGNA Desktop](https://ragna.app/) (proprietary)
- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)

@@ -200,15 +208,21 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)

*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

**Tools:**

- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML

---

Here is a typical run using LLaMA v2 13B on M2 Ultra:

```
$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info:
I UNAME_S:  Darwin
I UNAME_P:  arm

@@ -311,8 +325,6 @@ In order to build llama.cpp you have four different options.
    make
    ```

    **Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`

- On Windows:

  1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).

@@ -324,40 +336,38 @@ In order to build llama.cpp you have four different options.
    make
    ```

  - Notes:
    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
    - For faster repeated compilation, install [ccache](https://ccache.dev/).
    - For debug builds, run `make LLAMA_DEBUG=1`

- Using `CMake`:

  ```bash
  cmake -B build
  cmake --build build --config Release
  ```

  **Note**: for `Debug` builds, there are two cases:
  **Notes**:

  - Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
    - For faster repeated compilation, install [ccache](https://ccache.dev/).
    - For debug builds, there are two cases:

      1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):

    ```bash
    cmake -B build -DCMAKE_BUILD_TYPE=Debug
    cmake --build build
    ```

    - Multi-config generators (`-G` param set to Visual Studio, XCode...):
      2. Multi-config generators (`-G` param set to Visual Studio, XCode...):

    ```bash
    cmake -B build -G "Xcode"
    cmake --build build --config Debug
    ```

- Using `Zig` (version 0.11 or later):

  Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
  it's also possible to cross compile for other operating systems and architectures:

  ```bash
  zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
  ```

  The `zig targets` command will give you valid options to use.

- Using `gmake` (FreeBSD):

  1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)

@@ -365,15 +375,42 @@ In order to build llama.cpp you have four different options.
  3. Install compilation dependencies.

    ```bash
    sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
    sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
        opencl clblast openblas

    gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
    ```

**Notes:** With this packages you can build llama.cpp with OPENBLAS and
CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
the instructions for use and activate this options in this document below.

### Homebrew

On Mac and Linux, the homebrew package manager can be used via
```
brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668

### Nix

On Mac and Linux, the Nix package manager can be used via
```
nix profile install nixpkgs#llama-cpp
```
For flake enabled installs.

Or
```
nix-env --file '<nixpkgs>' --install --attr llama-cpp
```
For non-flake enabled installs.

This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).

#### Flox

On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
```
flox install llama-cpp
```
Flox follows the nixpkgs build of llama.cpp.

### Metal Build

@@ -385,7 +422,7 @@ argument.

### BLAS Build

Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:

- #### Accelerate Framework:

@@ -473,10 +510,12 @@ Building the program with BLAS support may lead to some performance improvements
  |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
  | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
  | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
  | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
  | LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. |
  | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
  | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
  | LLAMA_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |

|
- #### hipBLAS
|
||||||
|
|
||||||
|
@ -537,111 +576,6 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||||
|
|
||||||
- #### CLBlast
|
|
||||||
|
|
||||||
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
|
|
||||||
|
|
||||||
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
|
|
||||||
- For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
|
|
||||||
|
|
||||||
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
|
|
||||||
|
|
||||||
- <details>
|
|
||||||
<summary>Installing the OpenCL SDK from source</summary>
|
|
||||||
|
|
||||||
```sh
|
|
||||||
git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
|
|
||||||
cd OpenCL-SDK
|
|
||||||
cmake -B build -DBUILD_DOCS=OFF \
|
|
||||||
-DBUILD_EXAMPLES=OFF \
|
|
||||||
-DBUILD_TESTING=OFF \
|
|
||||||
-DOPENCL_SDK_BUILD_SAMPLES=OFF \
|
|
||||||
-DOPENCL_SDK_TEST_SAMPLES=OFF
|
|
||||||
cmake --build build
|
|
||||||
cmake --install build --prefix /some/path
|
|
||||||
```
|
|
||||||
</details>
|
|
||||||
|
|
||||||
##### Installing CLBlast
|
|
||||||
|
|
||||||
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
|
|
||||||
|
|
||||||
Linux packaging:
|
|
||||||
Fedora Linux:
|
|
||||||
```bash
|
|
||||||
sudo dnf install clblast
|
|
||||||
```
|
|
||||||
|
|
||||||
Alternatively, they may be built from source.
|
|
||||||
|
|
||||||
- <details>
|
|
||||||
<summary>Windows:</summary>
|
|
||||||
|
|
||||||
```cmd
|
|
||||||
set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
|
|
||||||
git clone https://github.com/CNugteren/CLBlast.git
|
|
||||||
cd CLBlast
|
|
||||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
|
|
||||||
cmake --build build --config Release
|
|
||||||
cmake --install build --prefix C:/CLBlast
|
|
||||||
```
|
|
||||||
|
|
||||||
(note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
|
|
||||||
|
|
||||||
- <details>
|
|
||||||
<summary>Unix:</summary>
|
|
||||||
|
|
||||||
```sh
|
|
||||||
git clone https://github.com/CNugteren/CLBlast.git
|
|
||||||
cd CLBlast
|
|
||||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
|
|
||||||
cmake --build build --config Release
|
|
||||||
cmake --install build --prefix /some/path
|
|
||||||
```
|
|
||||||
|
|
||||||
Where `/some/path` is where the built library will be installed (default is `/usr/local`).
|
|
||||||
</details>
|
|
||||||
|
|
||||||
##### Building Llama with CLBlast
|
|
||||||
|
|
||||||
- Build with make:
|
|
||||||
```sh
|
|
||||||
make LLAMA_CLBLAST=1
|
|
||||||
```
|
|
||||||
- CMake (Unix):
|
|
||||||
```sh
|
|
||||||
cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
|
|
||||||
cmake --build build --config Release
|
|
||||||
```
|
|
||||||
- CMake (Windows):
|
|
||||||
```cmd
|
|
||||||
set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
|
|
||||||
git clone https://github.com/ggerganov/llama.cpp
|
|
||||||
cd llama.cpp
|
|
||||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
|
|
||||||
cmake --build build --config Release
|
|
||||||
cmake --install build --prefix C:/LlamaCPP
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Running Llama with CLBlast
|
|
||||||
|
|
||||||
The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
|
|
||||||
|
|
||||||
To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
|
|
||||||
The selection can be a number (starting from 0) or a text string to search:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
GGML_OPENCL_PLATFORM=1 ./main ...
|
|
||||||
GGML_OPENCL_DEVICE=2 ./main ...
|
|
||||||
GGML_OPENCL_PLATFORM=Intel ./main ...
|
|
||||||
GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
|
|
||||||
```
|
|
||||||
|
|
||||||
The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
|
|
||||||
Using the variables it is possible to select a CPU-based driver as well, if so desired.
|
|
||||||
|
|
||||||
You can get a list of platforms and devices from the `clinfo -l` command, etc.
|
|
||||||
|
|
||||||
- #### Vulkan
|
- #### Vulkan
|
||||||
|
|
||||||
**With docker**:
|
**With docker**:
|
||||||
|
@@ -650,7 +584,7 @@ Building the program with BLAS support may lead to some performance improvements

  ```sh
  # Build the image
  docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
  docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .

  # Then, use it:
  docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33

@@ -671,7 +605,9 @@ Building the program with BLAS support may lead to some performance improvements
  vulkaninfo
  ```

  Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
  Alternatively your package manager might be able to provide the appropriate libraries.
  For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
  For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.

  Then, build llama.cpp using the cmake command below:

@@ -679,7 +615,7 @@ Building the program with BLAS support may lead to some performance improvements
  cmake -B build -DLLAMA_VULKAN=1
  cmake --build build --config Release
  # Test the output binary (with "-ngl 33" to offload all layers to GPU)
  ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
  ./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4

  # You should see in the output, ggml_vulkan detected your GPU. For example:
  # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32

@@ -692,7 +628,8 @@ Building the program with BLAS support may lead to some performance improvements

To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.

```bash
# obtain the official LLaMA model weights and place them in ./models

@@ -709,23 +646,20 @@ ls ./models
python3 -m pip install -r requirements.txt

# convert the model to ggml FP16 format
python3 convert.py models/mymodel/
python3 convert-hf-to-gguf.py models/mymodel/

# [Optional] for models using BPE tokenizers
python convert.py models/mymodel/ --vocab-type bpe

# quantize the model to 4-bits (using Q4_K_M method)
./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M

# update the gguf filetype to current version if older version is now unsupported
./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
```

### Run the quantized model

```bash
# start inference on a gguf model
./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
```

When running the larger models, make sure you have enough disk space to store all the intermediate files.

@@ -800,7 +734,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread
#### How to run

1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
perplexity : calculating perplexity over 655 chunks

@@ -824,16 +758,16 @@ Here is an example of a few-shot interaction, invoked with the command
./examples/chat-13B.sh

# custom arguments using a 13B model
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```

Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.



### Persistent Interaction

The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.

```bash
# Start a new chat

@@ -855,41 +789,13 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:

```bash
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
```

The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).

For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.

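As a small, hedged sketch of writing a custom grammar (the grammar below is illustrative and not one of the bundled samples; the model path is a placeholder):

```sh
# a minimal GBNF grammar that only allows "yes" or "no"
cat > yes-no.gbnf <<'EOF'
root ::= "yes" | "no"
EOF

# constrain generation with it
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 8 --grammar-file yes-no.gbnf -p 'Is the sky blue? Answer:'
```
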
### Instruct mode

1. First, download and place the `ggml` model into the `./models` folder
2. Run the `main` tool like this:

```
./examples/alpaca.sh
```

Sample run:

```
== Running in interactive mode. ==
 - Press Ctrl+C to interject at any time.
 - Press Return to return control to LLaMA.
 - If you want to submit another line, end your input in '\'.

Below is an instruction that describes a task. Write a response that appropriately completes the request.

> How many letters are there in the English alphabet?
There 26 letters in the English Alphabet
> What is the most common way of transportation in Amsterdam?
The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
> List 5 words that start with "ca".
cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```
### Obtaining and using the Facebook LLaMA 2 model

- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.

@@ -962,7 +868,7 @@ $mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/ho
Now, you can start chatting:

```
$cd /data/data/com.termux/files/home/bin
$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
```

Here's a demo of an interactive session running on Pixel 5 phone:

@@ -1029,8 +935,8 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
```

You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
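For example, a sketch of overriding the target GPU architecture at build time; `CUDA_DOCKER_ARCH` is the build argument exposed by the CUDA Dockerfiles, and the value shown here is only an illustration for one GPU generation:

```bash
# Illustrative only: build the light CUDA image for a single architecture
# instead of the default "all".
docker build -t local/llama.cpp:light-cuda \
    --build-arg CUDA_DOCKER_ARCH=sm_86 \
    -f .devops/llama-cli-cuda.Dockerfile .
```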
@@ -1080,7 +986,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
### Docs

- [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md)
226 ci/run.sh
@@ -287,7 +287,7 @@ function gg_run_open_llama_7b_v2 {

     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

     model_f16="${path_models}/ggml-model-f16.gguf"
     model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -303,47 +303,47 @@ function gg_run_open_llama_7b_v2 {

     wiki_test="${path_wiki}/wiki.test.raw"

-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

-    (time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log

-    (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log

-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

The remaining q4_0, q4_1, q5_0, q5_1, q2_k, q3_k, q4_k, q5_k and q6_k `quantize`, `main` and `perplexity` invocations in this hunk receive the identical `llama-` prefix rename.

     function check_ppl {
         qnt="$1"
@@ -437,45 +437,45 @@ function gg_run_pythia_1_4b {

     wiki_test_60="${path_wiki}/wiki.test-60.raw"

-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

-    (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log

-    (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log

-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

The remaining q4_0 through q6_k `quantize`, `main` and `perplexity` invocations in this hunk receive the identical `llama-` prefix rename.

     function check_ppl {
         qnt="$1"
@@ -569,47 +569,47 @@ function gg_run_pythia_2_8b {

     wiki_test="${path_wiki}/wiki.test.raw"

-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

-    (time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log

-    (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log

-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

The remaining q4_0, q4_1, q5_0, q5_1, q2_k, q3_k, q4_k, q5_k and q6_k `quantize`, `main` and `perplexity` invocations in this hunk receive the identical `llama-` prefix rename.

     function check_ppl {
         qnt="$1"
@@ -693,10 +693,10 @@ function gg_run_embd_bge_small {

     model_f16="${path_models}/ggml-model-f16.gguf"
     model_q8_0="${path_models}/ggml-model-q8_0.gguf"

-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

-    (time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

     set +e
 }
@@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER clang++ )

 set( CMAKE_C_COMPILER_TARGET ${target} )
 set( CMAKE_CXX_COMPILER_TARGET ${target} )

-set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
+set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
 set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )

 set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
10 cmake/llama.pc.in (new file)

@@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: llama
+Description: Port of Facebook's LLaMA model in C/C++
+Version: @PROJECT_VERSION@
+Libs: -L${libdir} -lllama
+Cflags: -I${includedir}
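Once installed, the generated `llama.pc` lets downstream builds discover the library through `pkg-config`. A minimal sketch, assuming the file is on `PKG_CONFIG_PATH` and that `main.c` is a placeholder program using the C API:

```bash
# Illustrative only: compile against the installed library using the
# flags published in llama.pc.
cc main.c $(pkg-config --cflags --libs llama) -o main
```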
@@ -84,4 +84,4 @@ endif ()

 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
File diff suppressed because it is too large

198 common/common.h
@@ -56,66 +56,66 @@ struct gpt_params {

     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

     int32_t n_threads = cpu_get_num_math();
     int32_t n_threads_draft = -1;
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 512; // context size
+    int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_draft = 5; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
     float p_split = 0.1f; // speculative decoding split probability
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_beams = 0; // if non-zero then use beam search of given width.
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
     float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
     float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
-    std::string rpc_servers = ""; // comma separated list of RPC servers

     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;

     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings

     // // sampling parameters
     struct llama_sampling_params sparams;

     std::string model = ""; // model path
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string model_url = ""; // model url to download
     std::string hf_repo = ""; // HF repo
     std::string hf_file = ""; // HF file
     std::string prompt = "";
     std::string prompt_file = ""; // store the external prompt file name
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
     std::string input_suffix = ""; // string to suffix user inputs with
-    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
     std::string logdir = ""; // directory in which to save YAML log files
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
     std::string logits_file = ""; // file for saving *all* logits
+    std::string rpc_servers = ""; // comma separated list of RPC servers
+
+    std::vector<std::string> in_files; // all input files
+    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;

     // TODO: avoid tuple, use struct
@@ -124,30 +124,31 @@ struct gpt_params {

     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

+    int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector

-    int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
-    int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+    int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                  //       (which is more convenient to use for plotting)
                                  //
     bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

     bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

     bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
     size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

     bool kl_divergence = false; // compute KL divergence

-    bool random_prompt = false; // do not randomize prompt if none provided
+    bool usage = false; // print usage
     bool use_color = false; // use color to distinguish generations and inputs
+    bool special = false; // enable special token output
     bool interactive = false; // interactive mode
-    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
+    bool interactive_first = false; // wait for user input immediately
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
-    bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
@@ -155,8 +156,7 @@ struct gpt_params {

     int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [] or [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embendings
-    bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
+    bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
-    bool interactive_first = false; // wait for user input immediately
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
@@ -164,7 +164,6 @@ struct gpt_params {

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos = false; // ignore generated EOS tokens
-    bool instruct = false; // instruction mode (used for Alpaca models)
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
@@ -182,6 +181,68 @@ struct gpt_params {

     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)

+    // server params
+    int32_t port = 8080; // server listens on this network port
+    int32_t timeout_read = 600; // http read timeout in seconds
+    int32_t timeout_write = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1; // number of threads to process HTTP requests
+
+    std::string hostname = "127.0.0.1";
+    std::string public_path = "";
+    std::string chat_template = "";
+    std::string system_prompt = "";
+
+    std::vector<std::string> api_keys;
+
+    std::string ssl_file_key = "";
+    std::string ssl_file_cert = "";
+
+    bool endpoint_slots = true;
+    bool endpoint_metrics = false;
+
+    bool log_json = false;
+
+    std::string slot_save_path;
+
+    float slot_prompt_similarity = 0.5f;
+
+    // batched-bench params
+    bool is_pp_shared = false;
+
+    std::vector<int32_t> n_pp;
+    std::vector<int32_t> n_tg;
+    std::vector<int32_t> n_pl;
+
+    // retrieval params
+    std::vector<std::string> context_files; // context files to embed
+
+    int32_t chunk_size = 64; // chunk size for context embedding
+
+    std::string chunk_separator = "\n"; // chunk separator for context embedding
+
+    // passkey params
+    int32_t n_junk = 250; // number of times to repeat the junk text
+    int32_t i_pos = -1; // position of the passkey in the junk text
+
+    // imatrix params
+    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+    int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
+    int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
+    int32_t i_chunk = 0; // start processing from this chunk
+
+    bool process_output = false; // collect data for the output tensor
+    bool compute_ppl = true; // whether to compute perplexity
+
+    // cvector-generator params
+    int n_completions = 64;
+    int n_pca_batch = 20;
+    int n_pca_iterations = 1000;
+    std::string cvector_outfile = "control_vector.gguf";
+    std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
+    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
 };

 void gpt_params_handle_model_default(gpt_params & params);
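The new server-related fields above correspond to command-line options of the renamed `llama-server` binary. A hypothetical invocation sketch; the flag values mirror the defaults shown in the struct, and the model path is a placeholder:

```bash
# Illustrative only: serve a model on the default bind address and port.
./llama-server -m ./models/13B/ggml-model-q4_0.gguf --host 127.0.0.1 --port 8080
```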
@@ -201,7 +262,20 @@ std::vector<std::string> string_split(std::string input, char separator);

 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
-std::string string_random_prompt(std::mt19937 & rng);
+
+template<class T>
+static std::vector<T> string_split(const std::string & str, char delim) {
+    std::vector<T> values;
+    std::istringstream str_stream(str);
+    std::string token;
+    while (std::getline(str_stream, token, delim)) {
+        T value;
+        std::istringstream token_stream(token);
+        token_stream >> value;
+        values.push_back(value);
+    }
+    return values;
+}

 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
@@ -214,6 +288,7 @@ bool fs_validate_filename(const std::string & filename);

 bool fs_create_directory_with_parents(const std::string & path);

 std::string fs_get_cache_directory();
+std::string fs_get_cache_file(const std::string & filename);

 //
 // Model utils

@@ -284,6 +359,13 @@ std::string llama_detokenize_bpe(

 // defaults to true when model type is SPM, otherwise false.
 bool llama_should_add_bos_token(const llama_model * model);

+//
+// Chat template utils
+//
+
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool llama_chat_verify_template(const std::string & tmpl);
+
 //
 // KV cache utils
 //
@@ -46,8 +46,12 @@ namespace grammar_parser {

         state.rules[rule_id] = rule;
     }

+    static bool is_digit_char(char c) {
+        return '0' <= c && c <= '9';
+    }
+
     static bool is_word_char(char c) {
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
+        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
     }

     static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
@@ -99,6 +103,17 @@ namespace grammar_parser {

         return pos;
     }

+    static const char * parse_int(const char * src) {
+        const char * pos = src;
+        while (is_digit_char(*pos)) {
+            pos++;
+        }
+        if (pos == src) {
+            throw std::runtime_error(std::string("expecting integer at ") + src);
+        }
+        return pos;
+    }
+
     static std::pair<uint32_t, const char *> parse_char(const char * src) {
         if (*src == '\\') {
             switch (src[1]) {
@@ -137,6 +152,60 @@ namespace grammar_parser {

             bool is_nested) {
         size_t last_sym_start = out_elements.size();
         const char * pos = src;

+        auto handle_repetitions = [&](int min_times, int max_times) {
+
+            if (last_sym_start == out_elements.size()) {
+                throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
+            }
+
+            // apply transformation to previous symbol (last_sym_start to end) according to
+            // the following rewrite rules:
+            // S{m,n} --> S S S (m times) S'(n-m)
+            //            S'(x)   ::= S S'(x-1) |
+            //            (... n-m definitions of these S' rules ...)
+            //            S'(1)   ::= S |
+            // S{m,}  --> S S S (m times) S'
+            //            S'     ::= S S' |
+            // S*     --> S{0,}
+            //        --> S'     ::= S S' |
+            // S+     --> S{1,}
+            //        --> S S'
+            //            S'     ::= S S' |
+            // S?     --> S{0,1}
+            //        --> S'
+            //            S'     ::= S |
+
+            std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
+            if (min_times == 0) {
+                out_elements.resize(last_sym_start);
+            } else {
+                // Repeat the previous elements (min_times - 1) times
+                for (int i = 1; i < min_times; i++) {
+                    out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
+                }
+            }
+
+            uint32_t last_rec_rule_id = 0;
+            auto n_opt = max_times < 0 ? 1 : max_times - min_times;
+
+            std::vector<llama_grammar_element> rec_rule(previous_elements);
+            for (int i = 0; i < n_opt; i++) {
+                rec_rule.resize(previous_elements.size());
+                uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
+                if (i > 0 || max_times < 0) {
+                    rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
+                }
+                rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+                rec_rule.push_back({LLAMA_GRETYPE_END, 0});
+                add_rule(state, rec_rule_id, rec_rule);
+                last_rec_rule_id = rec_rule_id;
+            }
+            if (n_opt > 0) {
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
+            }
+        };
+
         while (*pos) {
             if (*pos == '"') { // literal string
                 pos++;
@@ -197,40 +266,51 @@ namespace grammar_parser {
                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
                }
                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
-                if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
-                }
-
-                // apply transformation to previous symbol (last_sym_start to end) according to
-                // rewrite rules:
-                // S* --> S' ::= S S' |
-                // S+ --> S' ::= S S' | S
-                // S? --> S' ::= S |
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                std::vector<llama_grammar_element> sub_rule;
-                // add preceding symbol to generated rule
-                sub_rule.insert(
-                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                if (*pos == '*' || *pos == '+') {
-                    // cause generated rule to recurse
-                    sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-                }
-                // mark start of alternate def
-                sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
-                if (*pos == '+') {
-                    // add preceding symbol as alternate only for '+' (otherwise empty)
-                    sub_rule.insert(
-                        sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                }
-                sub_rule.push_back({LLAMA_GRETYPE_END, 0});
-                add_rule(state, sub_rule_id, sub_rule);
-
-                // in original rule, replace previous symbol with reference to generated rule
-                out_elements.resize(last_sym_start);
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-
+            } else if (*pos == '.') { // any char
+                last_sym_start = out_elements.size();
+                out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '*') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, -1);
+            } else if (*pos == '+') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(1, -1);
+            } else if (*pos == '?') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, 1);
+            } else if (*pos == '{') {
+                pos = parse_space(pos + 1, is_nested);
+
+                if (!is_digit_char(*pos)) {
+                    throw std::runtime_error(std::string("expecting an int at ") + pos);
+                }
+                const char * int_end = parse_int(pos);
+                int min_times = std::stoul(std::string(pos, int_end - pos));
+                pos = parse_space(int_end, is_nested);
+
+                int max_times = -1;
+
+                if (*pos == '}') {
+                    max_times = min_times;
+                    pos = parse_space(pos + 1, is_nested);
+                } else if (*pos == ',') {
+                    pos = parse_space(pos + 1, is_nested);
+
+                    if (is_digit_char(*pos)) {
+                        const char * int_end = parse_int(pos);
+                        max_times = std::stoul(std::string(pos, int_end - pos));
+                        pos = parse_space(int_end, is_nested);
+                    }
+
+                    if (*pos != '}') {
+                        throw std::runtime_error(std::string("expecting '}' at ") + pos);
+                    }
+                    pos = parse_space(pos + 1, is_nested);
+                } else {
+                    throw std::runtime_error(std::string("expecting ',' at ") + pos);
+                }
+                handle_repetitions(min_times, max_times);
            } else {
                break;
            }
@@ -325,6 +405,7 @@ namespace grammar_parser {
            case LLAMA_GRETYPE_CHAR_NOT:       return true;
            case LLAMA_GRETYPE_CHAR_ALT:       return true;
            case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+            case LLAMA_GRETYPE_CHAR_ANY:       return true;
            default:                           return false;
        }
    }
@@ -339,6 +420,7 @@ namespace grammar_parser {
            case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT"); break;
            case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
            case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT"); break;
+            case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY"); break;
        }
        switch (elem.type) {
            case LLAMA_GRETYPE_END:
@@ -350,6 +432,7 @@ namespace grammar_parser {
            case LLAMA_GRETYPE_CHAR_NOT:
            case LLAMA_GRETYPE_CHAR_RNG_UPPER:
            case LLAMA_GRETYPE_CHAR_ALT:
+            case LLAMA_GRETYPE_CHAR_ANY:
                fprintf(file, "(\"");
                print_grammar_char(file, elem.value);
                fprintf(file, "\") ");
@@ -407,11 +490,15 @@ namespace grammar_parser {
                }
                print_grammar_char(file, elem.value);
                break;
+            case LLAMA_GRETYPE_CHAR_ANY:
+                fprintf(file, ".");
+                break;
        }
        if (is_char_element(elem)) {
            switch (rule[i + 1].type) {
                case LLAMA_GRETYPE_CHAR_ALT:
                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                case LLAMA_GRETYPE_CHAR_ANY:
                    break;
                default:
                    fprintf(file, "] ");
 
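The grammar-parser changes above add bounded repetition (`{n}`, `{n,}`, `{n,m}`) and the `.` any-character element. A minimal sketch of exercising them through the parser; it assumes the `grammar_parser::parse` and `grammar_parser::print_grammar` entry points from the same file, and the exact header name is an assumption:

```cpp
#include <cstdio>
#include "grammar-parser.h" // assumed header for the parser modified above

int main() {
    // "root" matches two to four digits followed by any single character.
    // '{2,4}' is rewritten via handle_repetitions(); '.' becomes a CHAR_ANY element.
    const char * gbnf = "root ::= [0-9]{2,4} .\n";

    grammar_parser::parse_state state = grammar_parser::parse(gbnf);

    // Dumps the expanded rules, including the generated repetition helper rules.
    grammar_parser::print_grammar(stdout, state);
    return 0;
}
```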
@@ -16,92 +16,55 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa
 
 static std::string repeat(const std::string & str, size_t n);
 
-static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
-    if (separator_rule.empty()) {
-        if (min_items == 0 && max_items == 1) {
-            return item_rule + "?";
-        } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
-            return item_rule + "+";
-        }
-    }
-
-    std::string result;
-    if (min_items > 0) {
-        if (item_rule_is_literal && separator_rule.empty()) {
-            result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
-        } else {
-            std::vector<std::string> items(min_items, item_rule);
-            result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
-        }
-    }
-
-    std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
-        auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
-
-        if (up_to_n == 0) {
-            return "";
-        } else if (up_to_n == 1) {
-            return "(" + content + ")?";
-        } else if (!separator_rule.empty() && !prefix_with_sep) {
-            return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
-        } else {
-            std::string res = repeat("(" + content + " ", up_to_n);
-            // strip trailing space
-            res = res.substr(0, res.length() - 1);
-            res += repeat(")?", up_to_n);
-            return res;
-        }
-    };
-
-    if (min_items > 0 && max_items != min_items) {
-        result += " ";
-    }
-
-    if (max_items != std::numeric_limits<int>::max()) {
-        result += opt_repetitions(max_items - min_items, min_items > 0);
-    } else {
-        std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
-        if (min_items == 0 && !separator_rule.empty()) {
-            result = "(" + item_rule + " " + item_operator + "*)?";
-        } else {
-            result += item_operator + "*";
-        }
-    }
-
-    return result;
-}
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
+    auto has_max = max_items != std::numeric_limits<int>::max();
+
+    if (min_items == 0 && max_items == 1) {
+        return item_rule + "?";
+    }
+
+    if (separator_rule.empty()) {
+        if (min_items == 1 && !has_max) {
+            return item_rule + "+";
+        } else if (min_items == 0 && !has_max) {
+            return item_rule + "*";
+        } else {
+            return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
+        }
+    }
+
+    auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
+    if (min_items == 0) {
+        result = "(" + result + ")?";
+    }
+
+    return result;
+}
 
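For reference, a hedged sketch of what the rewritten `build_repetition` above emits for a few inputs (the function is file-local in the real source, so this is illustrative rather than a drop-in test):

```cpp
#include <climits> // INT_MAX equals std::numeric_limits<int>::max(), the "no maximum" sentinel
#include <cstdio>

// Assumes the build_repetition shown above is visible in this translation unit.
int main() {
    std::printf("%s\n", build_repetition("[0-9]", 0, 1).c_str());       // "[0-9]?"
    std::printf("%s\n", build_repetition("[0-9]", 1, INT_MAX).c_str()); // "[0-9]+"
    std::printf("%s\n", build_repetition("[0-9]", 0, INT_MAX).c_str()); // "[0-9]*"
    std::printf("%s\n", build_repetition("[0-9]", 3, 5).c_str());       // "[0-9]{3,5}"
    return 0;
}
```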
-const std::string SPACE_RULE = "\" \"?";
+const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
 
 struct BuiltinRule {
     std::string content;
     std::vector<std::string> deps;
 };
 
-const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
-
 std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
     {"boolean", {"(\"true\" | \"false\") space", {}}},
-    {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
-    {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
+    {"decimal-part", {"[0-9]{1,16}", {}}},
+    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
     {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
     {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
     {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
     {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
     {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
-    {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
-    {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
+    {"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
     {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
     {"null", {"\"null\" space", {}}},
 };
 
 std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
-    {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
-    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
     {"date-time", {"date \"T\" time", {"date", "time"}}},
     {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
     {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
@@ -385,8 +348,7 @@ private:
                 sub_is_literal ? "\"" + sub + "\"" : sub,
                 min_times,
                 max_times,
-                "",
-                sub_is_literal
+                ""
             );
             seq.back().second = false;
         } else {
 
@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
 
     params.custom_n_ctx = false;
 
-    params.use_flash = true;
+    params.use_flash = false;
     params.use_checkpointing = true;
 
     params.sample_start = "";
 
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 
 # This script downloads the tokenizer models of the specified models from Huggingface and
 # generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
@@ -81,6 +82,9 @@ models = [
     {"name": "jina-v2-en",   "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
     {"name": "jina-v2-es",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+    {"name": "smaug-bpe",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
+    {"name": "poro-chat",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
+    {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
 ]
 
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 
 from __future__ import annotations
 
@@ -25,8 +26,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
-from convert import LlamaHfVocab
-
 logger = logging.getLogger("hf-to-gguf")
 
@@ -48,11 +47,12 @@ class Model:
     _model_classes: dict[str, type[Model]] = {}
 
     dir_model: Path
-    ftype: int
+    ftype: gguf.LlamaFileType
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool
     lazy: bool
+    model_name: str | None
     part_names: list[str]
     is_safetensors: bool
     hparams: dict[str, Any]
@@ -65,7 +65,7 @@ class Model:
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
         self.dir_model = dir_model
@@ -74,10 +74,11 @@ class Model:
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
         self.lazy = not eager
-        self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
+        self.model_name = model_name
+        self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
-            self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
+            self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@@ -95,7 +96,7 @@ class Model:
             ftype_lw: str = ftype_up.lower()
             # allow templating the file name with the output ftype, useful with the "auto" ftype
             self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
-        self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
 
     @classmethod
     def __init_subclass__(cls):
@@ -183,7 +184,7 @@ class Model:
         return new_name
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_block_count(self.block_count)
 
         if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -325,21 +326,21 @@ class Model:
 
     def write(self):
         self.write_tensors()
-        self.gguf_writer.write_header_to_file()
+        self.gguf_writer.write_header_to_file(self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.write_tensors_to_file(progress=True)
         self.gguf_writer.close()
 
     def write_vocab(self):
-        self.gguf_writer.write_header_to_file()
+        self.gguf_writer.write_header_to_file(self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.close()
 
     @staticmethod
-    def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
+    def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
         part_names: list[str] = []
         for filename in os.listdir(dir_model):
-            if filename.endswith(suffix):
+            if filename.startswith(prefix) and filename.endswith(suffix):
                 part_names.append(filename)
 
         part_names.sort()
@@ -473,6 +474,15 @@ class Model:
         if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
             res = "jina-v2-de"
+        if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
+            # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
+            res = "smaug-bpe"
+        if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
+            # ref: https://huggingface.co/LumiOpen/Poro-34B-chat
+            res = "poro-chat"
+        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
+            res = "jina-v2-code"
 
         if res is None:
             logger.warning("\n")
@@ -631,7 +641,7 @@ class Model:
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def _set_vocab_llama_hf(self):
-        vocab = LlamaHfVocab(self.dir_model)
+        vocab = gguf.LlamaHfVocab(self.dir_model)
         tokens = []
         scores = []
         toktypes = []
@@ -660,7 +670,7 @@ class GPTNeoXModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
 
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -793,7 +803,7 @@ class MPTModel(Model):
 
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layers"]
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
         self.gguf_writer.add_block_count(block_count)
@@ -845,7 +855,7 @@ class OrionModel(Model):
             raise ValueError("gguf: can not find ctx length parameter.")
 
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
@@ -882,7 +892,7 @@ class BaichuanModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")
 
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
@@ -1005,7 +1015,7 @@ class XverseModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")
 
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
@@ -1201,7 +1211,7 @@ class StableLMModel(Model):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]
 
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -1314,6 +1324,17 @@ class LlamaModel(Model):
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
             self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -1328,9 +1349,9 @@ class LlamaModel(Model):
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
 
-        if name.endswith("q_proj.weight"):
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith("k_proj.weight"):
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         # process the experts separately
@@ -1611,6 +1632,12 @@ class Qwen2MoeModel(Model):
         super().set_gguf_parameters()
         if (n_experts := self.hparams.get("num_experts")) is not None:
             self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
+            logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
 
     _experts: list[dict[str, Tensor]] | None = None
 
@@ -1665,7 +1692,7 @@ class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_block_count(self.hparams["n_layer"])
         self.gguf_writer.add_context_length(self.hparams["n_ctx"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -2232,7 +2259,7 @@ class GemmaModel(Model):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]
 
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -2332,7 +2359,7 @@ class MambaModel(Model):
         # Fail early for models which don't have a block expansion factor of 2
         assert d_inner == 2 * d_model
 
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
         self.gguf_writer.add_embedding_length(d_model)
         self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@@ -2392,7 +2419,8 @@ class CommandR2Model(Model):
 
         # max_position_embeddings = 8192 in config.json but model was actually
         # trained on 128k context length
-        self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
+        # aya-23 models don't have model_max_length specified
+        self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -2438,11 +2466,13 @@ class JinaBertV2Model(BertModel):
 
     def get_tensors(self):
         for name, data in super().get_tensors():
-            if 'gated_layers' in name:
+            if 'gated_layer' in name:
                 d1 = data[:self.intermediate_size, :]
                 name1 = name.replace('gated_layers', 'gated_layers_w')
+                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
                 d2 = data[self.intermediate_size:, :]
                 name2 = name.replace('gated_layers', 'gated_layers_v')
+                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
                 yield name1, d1
                 yield name2, d2
                 continue
@@ -2616,6 +2646,85 @@ class ArcticModel(Model):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("DeepseekV2ForCausalLM")
+class DeepseekV2Model(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["v_head_dim"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+                self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def write_tensors(self):
+        super().write_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 ###### CONVERSION LOGIC ######
 
@@ -2748,8 +2857,13 @@ def main() -> None:
     hparams = Model.load_hparams(dir_model)
 
     with torch.inference_mode():
-        model_class = Model.from_model_architecture(hparams["architectures"][0])
-        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
+        try:
+            model_class = Model.from_model_architecture(hparams["architectures"][0])
+        except NotImplementedError:
+            logger.error(f"Model {hparams['architectures'][0]} is not supported")
+            sys.exit(1)
+
+        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
 
         logger.info("Set model parameters")
         model_instance.set_gguf_parameters()
 
@@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
 ### 1. Convert the model to GGUF
 
 This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
+Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
 
 The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
 
@@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
 
 When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
 
-Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
+Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
 
 ## GGUF specification
 
@@ -3,7 +3,7 @@
 ## Verifying that the model is running on the GPU with CUDA
 Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
-./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
+./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```
 
 When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -27,7 +27,7 @@ RAM: 32GB
 
 Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
 
-Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
 
 Result:
 
@@ -12,44 +12,45 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 if (EMSCRIPTEN)
 else()
+    add_subdirectory(cvector-generator)
     add_subdirectory(baby-llama)
-    add_subdirectory(batched)
     add_subdirectory(batched-bench)
-    add_subdirectory(beam-search)
+    add_subdirectory(batched)
     add_subdirectory(benchmark)
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
+    add_subdirectory(export-lora)
     add_subdirectory(finetune)
-    add_subdirectory(gritlm)
+    add_subdirectory(gbnf-validator)
     add_subdirectory(gguf-split)
+    add_subdirectory(gguf)
+    add_subdirectory(gritlm)
+    add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
     add_subdirectory(llava)
-    if (LLAMA_SYCL)
-        add_subdirectory(sycl)
-    endif()
-    add_subdirectory(main)
-    add_subdirectory(tokenize)
-    add_subdirectory(parallel)
-    add_subdirectory(perplexity)
-    add_subdirectory(quantize)
-    add_subdirectory(quantize-stats)
-    add_subdirectory(retrieval)
-    add_subdirectory(save-load-state)
-    add_subdirectory(simple)
-    add_subdirectory(passkey)
-    add_subdirectory(speculative)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
-    add_subdirectory(gguf)
-    add_subdirectory(train-text-from-scratch)
-    add_subdirectory(imatrix)
-    if (LLAMA_BUILD_SERVER)
-        add_subdirectory(server)
-    endif()
-    add_subdirectory(export-lora)
+    add_subdirectory(main)
+    add_subdirectory(parallel)
+    add_subdirectory(passkey)
+    add_subdirectory(perplexity)
+    add_subdirectory(quantize-stats)
+    add_subdirectory(quantize)
+    add_subdirectory(retrieval)
     if (LLAMA_RPC)
         add_subdirectory(rpc)
     endif()
+    if (LLAMA_BUILD_SERVER)
+        add_subdirectory(server)
+    endif()
+    if (LLAMA_SYCL)
+        add_subdirectory(sycl)
+    endif()
+    add_subdirectory(save-load-state)
+    add_subdirectory(simple)
+    add_subdirectory(speculative)
+    add_subdirectory(tokenize)
+    add_subdirectory(train-text-from-scratch)
 endif()
 
@@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
     GEN_OPTIONS+=(--threads "$N_THREAD")
 fi
 
-./main "${GEN_OPTIONS[@]}" \
+./llama-cli "${GEN_OPTIONS[@]}" \
     --model "$MODEL" \
     --in-prefix " " \
     --in-suffix "${AI_NAME}:" \
 
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-#
-# Temporary script - will be removed in the future
-#
-
-cd `dirname $0`
-cd ..
-
-./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
-       --color \
-       -f ./prompts/alpaca.txt \
-       --ctx_size 2048 \
-       -n -1 \
-       -ins -b 256 \
-       --top_k 10000 \
-       --temp 0.2 \
-       --repeat_penalty 1.1 \
-       -t 7
 
@@ -1,4 +1,4 @@
-set(TARGET baby-llama)
+set(TARGET llama-baby-llama)
 add_executable(${TARGET} baby-llama.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 
@@ -522,8 +522,8 @@ static struct ggml_tensor * forward(
     // wk   shape [n_embd, n_embd, 1, 1]
     // Qcur shape [n_embd/n_head, n_head, N, 1]
     // Kcur shape [n_embd/n_head, n_head, N, 1]
-    struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
-    struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
+    struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
+    struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
 
     // store key and value to memory
     {
@@ -759,8 +759,8 @@ static struct ggml_tensor * forward_batch(
     // wk   shape [n_embd, n_embd, 1, 1]
     // Qcur shape [n_embd/n_head, n_head, N, n_batch]
     // Kcur shape [n_embd/n_head, n_head, N, n_batch]
-    struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
-    struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
+    struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
+    struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
     assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
     assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
 
@@ -1056,7 +1056,7 @@ static struct ggml_tensor * forward_lora(
                         model->layers[il].wqb,
                         cur)),
                 n_embd/n_head, n_head, N),
-            KQ_pos, n_rot, 0, 0);
+            KQ_pos, n_rot, 0);
     struct ggml_tensor * Kcur = ggml_rope(ctx0,
             ggml_reshape_3d(ctx0,
                 ggml_mul_mat(ctx0,
@@ -1065,7 +1065,7 @@ static struct ggml_tensor * forward_lora(
                         model->layers[il].wkb,
                         cur)),
                 n_embd/n_head, n_head, N),
-            KQ_pos, n_rot, 0, 0);
+            KQ_pos, n_rot, 0);
 
     // store key and value to memory
     {
 
@@ -58,4 +58,4 @@ echo "$2
 model=$1
 
 # generate the most likely continuation until the string "===" is found
-./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
+./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
 
@@ -1,4 +1,4 @@
-set(TARGET batched-bench)
+set(TARGET llama-batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 
@ -10,16 +10,16 @@ There are 2 modes of operation:
|
||||||
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
|
./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
|
||||||
|
|
||||||
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
||||||
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
|
./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
|
||||||
|
|
||||||
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
||||||
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
|
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
|
||||||
|
|
||||||
# custom set of batches
|
# custom set of batches
|
||||||
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
|
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
|
||||||
```
|
```
|
||||||
|
|
||||||
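As a worked instance of the shared-prompt formula above: with `PP = 512`, `TG = 128` and `B = 8`, a shared prompt gives `N_KV = 512 + 8*128 = 1536` KV cells, whereas independent prompts need roughly `N_KV = 8*(512 + 128) = 5120` (assuming the non-shared mode keeps a full copy of the prompt per batch).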
## Sample results
@@ -28,67 +28,27 @@ static std::vector<int> parse_list(char * p) {
return ret;
}

static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);

LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG_TEE("\n");
}

int main(int argc, char ** argv) {
gpt_params params;

if (argc == 1 || argv[1][0] == '-') {
if (!gpt_params_parse(argc, argv, params)) {
printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
print_usage(argc, argv, params);
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
return 1;
printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1 ;
}

int n_kv_max = 2048;
int is_pp_shared = params.is_pp_shared;
int n_batch = 2048;
int n_ubatch = 512;
bool flash_attn = false;
int is_pp_shared = 0;
int n_gpu_layers = 0;

std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
std::vector<int> n_pp = params.n_pp;
std::vector<int> n_tg = { 128, 256, };
std::vector<int> n_tg = params.n_tg;
std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
std::vector<int> n_pl = params.n_pl;
//std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };

if (argc >= 2) {
params.model = argv[1];
}

if (argc >= 3) {
n_kv_max = std::atoi(argv[2]);
}

if (argc >= 4) {
n_batch = std::atoi(argv[3]);
}

if (argc >= 5) {
n_ubatch = std::atoi(argv[4]);
}

if (argc >= 6) {
flash_attn = std::atoi(argv[5]);
}

if (argc >= 7) {
is_pp_shared = std::atoi(argv[6]);
}

if (argc >= 8) {
n_gpu_layers = std::atoi(argv[7]);
}

if (argc >= 9) {
n_pp = parse_list(argv[8]);
}

if (argc >= 10) {
n_tg = parse_list(argv[9]);
}

if (argc >= 11) {
n_pl = parse_list(argv[10]);
}

// init LLM

@@ -97,12 +57,7 @@ int main(int argc, char ** argv) {

// initialize the model

llama_model_params model_params = llama_model_default_params();
llama_model_params model_params = llama_model_params_from_gpt_params(params);

const std::vector<float> t_split(llama_max_devices(), 0.0f);

model_params.n_gpu_layers = n_gpu_layers;
model_params.tensor_split = t_split.data();

llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -111,16 +66,7 @@ int main(int argc, char ** argv) {
return 1;
}

llama_context_params ctx_params = llama_context_default_params();
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = n_batch;
ctx_params.n_ubatch = n_ubatch;
ctx_params.flash_attn = flash_attn;

ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

// ensure enough sequences are available
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());

@@ -132,6 +78,8 @@ int main(int argc, char ** argv) {
return 1;
}

const int32_t n_kv_max = llama_n_ctx(ctx);

llama_batch batch = llama_batch_init(n_kv_max, 0, 1);

// decode in batches of ctx_params.n_batch tokens

@@ -175,7 +123,7 @@ int main(int argc, char ** argv) {
}

LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");

LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
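The rewrite above is representative of how the examples in this commit move from hand-rolled positional arguments to the shared `gpt_params` helpers. A minimal sketch of that flow, assembled only from calls that appear in this diff; treat it as an illustration of the pattern, not the exact contents of any one file:

```cpp
#include "common.h" // gpt_params, gpt_params_parse, *_from_gpt_params helpers, LOG_TEE
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;

    // -m, -c, -b, -ub, -ngl, -npp, -ntg, -npl, ... are parsed into params
    if (!gpt_params_parse(argc, argv, params)) {
        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

    llama_backend_init();

    // model and context parameters are derived from params instead of being set by hand
    llama_model_params   model_params = llama_model_params_from_gpt_params(params);
    llama_model        * model        = llama_load_model_from_file(params.model.c_str(), model_params);

    llama_context_params ctx_params   = llama_context_params_from_gpt_params(params);
    llama_context      * ctx          = llama_new_context_with_model(model, ctx_params);

    // the effective KV size now comes from the context rather than argv
    const int n_kv_max = llama_n_ctx(ctx);
    LOG_TEE("n_kv_max = %d\n", n_kv_max);

    // ... run the benchmark / generation loop against ctx ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```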
@@ -1,6 +1,6 @@
.PHONY: build

build:
xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./batched_swift
rm -f ./llama-batched-swift
ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift

@@ -4,7 +4,7 @@
import PackageDescription

let package = Package(
name: "batched_swift",
name: "llama-batched-swift",
platforms: [.macOS(.v12)],
dependencies: [
.package(name: "llama", path: "../../"),

@@ -13,7 +13,7 @@ let package = Package(
// Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies.
.executableTarget(
name: "batched_swift",
name: "llama-batched-swift",
dependencies: ["llama"],
path: "Sources",
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]

@@ -1,4 +1,4 @@
This is a swift clone of `examples/batched`.

$ `make`
$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`

@@ -1,4 +1,4 @@
set(TARGET batched)
set(TARGET llama-batched)
add_executable(${TARGET} batched.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@@ -3,7 +3,7 @@
The example demonstrates batched generation from a given prompt

```bash
./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4

...
@@ -7,48 +7,31 @@
#include <string>
#include <vector>

static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);

LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
LOG_TEE("\n");
}

int main(int argc, char ** argv) {
gpt_params params;

if (argc == 1 || argv[1][0] == '-') {
params.prompt = "Hello my name is";
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
params.n_predict = 32;
return 1 ;

if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}

// number of parallel batches
int n_parallel = 1;
int n_parallel = params.n_parallel;

// total length of the sequences including the prompt
int n_len = 32;
int n_predict = 32;

// number of layers to offload to the GPU
int n_gpu_layers = 0;

if (argc >= 2) {
params.model = argv[1];
}

if (argc >= 3) {
params.prompt = argv[2];
}

if (argc >= 4) {
n_parallel = std::atoi(argv[3]);
}

if (argc >= 5) {
n_len = std::atoi(argv[4]);
}

if (argc >= 6) {
n_gpu_layers = std::atoi(argv[5]);
}

if (params.prompt.empty()) {
params.prompt = "Hello my name is";
}

string_process_escapes(params.prompt);

// init LLM

@@ -57,9 +40,7 @@ int main(int argc, char ** argv) {

// initialize the model

llama_model_params model_params = llama_model_default_params();
llama_model_params model_params = llama_model_params_from_gpt_params(params);

model_params.n_gpu_layers = n_gpu_layers;

llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -73,18 +54,14 @@ int main(int argc, char ** argv) {
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(model, params.prompt, true);

const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;

// initialize the context

llama_context_params ctx_params = llama_context_default_params();
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel);
ctx_params.n_batch = std::max(n_predict, n_parallel);
ctx_params.n_seq_max = n_parallel;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

llama_context * ctx = llama_new_context_with_model(model, ctx_params);

@@ -93,9 +70,9 @@ int main(int argc, char ** argv) {
return 1;
}

const int n_ctx = llama_n_ctx(ctx);

LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);

// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {

@@ -156,7 +133,7 @@ int main(int argc, char ** argv) {

const auto t_main_start = ggml_time_us();

while (n_cur <= n_len) {
while (n_cur <= n_predict) {
// prepare the next batch
llama_batch_clear(batch);

@@ -192,7 +169,7 @@ int main(int argc, char ** argv) {
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

// is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1;
LOG_TEE("\n");
if (n_parallel > 1) {
@@ -1,188 +0,0 @@
#include "common.h"
#include "llama.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif

// Used for debugging to print out beam tokens.
struct ostream_beam_view {
llama_context * ctx;
llama_beam_view beam_view;
};

static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
}
return os << ')';
}

// Put here anything you want back in beam_search_callback().
struct beam_search_callback_data {
llama_context * ctx;
std::vector<llama_token> response;
};

// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
}

// Function matching type llama_beam_search_callback_fn_t.
// Custom callback example is called each time the beams lengths increase:
// * Show progress by printing ',' following by number of convergent beam tokens if any.
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
llama_beam_view& beam_view = beams_state.beam_views[i];
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
beam_view.eob = true;
}
}
printf(","); // Show progress
if (const size_t n = beams_state.common_prefix_length) {
callback_data.response.resize(callback_data.response.size() + n);
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
std::copy(tokens, tokens + n, callback_data.response.end() - n);
printf("%zu", n);
}
fflush(stdout);
#if 1 // DEBUG: print current beams for this iteration
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
}
#endif
}

int main(int argc, char ** argv)
{
gpt_params params;
//params.n_gpu_layers = 200;

//---------------------------------
// Print help :
//---------------------------------

if ( argc < 2 || argv[1][0] == '-' )
{
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
return 1 ;
}

//---------------------------------
// Load parameters :
//---------------------------------

params.model = argv[1];

params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;

if ( argc > 3 )
{
params.prompt = argv[3];
}

if ( params.prompt.empty() )
{
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
}

//---------------------------------
// Init LLM :
//---------------------------------

llama_backend_init();
llama_numa_init(params.numa);

llama_model * model;
llama_context * ctx;

std::tie(model, ctx) = llama_init_from_gpt_params( params );

if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1;
}

//---------------------------------
// Tokenize the prompt :
//---------------------------------

std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);

const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;

if (tokens_list.size() > max_tokens_list_size)
{
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
__func__ , tokens_list.size() , max_tokens_list_size );
return 1;
}

fprintf( stderr, "\n\n" );

// Print the tokens from the prompt :

for( auto id : tokens_list )
{
std::cout << llama_token_to_piece(ctx, id);
}
std::cout << std::flush;

int n_past = 0;

if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
{
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
return 1;
}
n_past += tokens_list.size();

beam_search_callback_data callback_data{ctx, {}};
size_t const beam_width = static_cast<size_t>(params.n_beams);
int const n_predict = 256;
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);

std::cout << "\n\n";
for (llama_token const token_id : callback_data.response) {
std::cout << llama_token_to_piece(ctx,token_id);
}
std::cout << std::endl;

llama_free( ctx );
llama_free_model( model );

llama_backend_free();

return 0;
}
@@ -1,4 +1,4 @@
set(TARGET benchmark)
set(TARGET llama-bench-matmult)
add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})

@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE

# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./main $GEN_OPTIONS \
./llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \

@@ -62,7 +62,7 @@ fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...'
# Default batch_size to 64 here for better user feedback during initial prompt processing
./main 2>>"$LOG" \
./llama-cli 2>>"$LOG" \
--batch_size 64 \
"${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \

@@ -109,13 +109,13 @@ while read -e line; do

printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"

./main 2>>"$LOG" "${OPTS[@]}" \
./llama-cli 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \
--file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" |
skip_bytes 1 | # skip BOS token added by ./main
skip_bytes 1 | # skip BOS token added by ./llama-cli
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation

@@ -133,7 +133,7 @@ while read -e line; do
# TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
echo >&2 "Couldn't get number of tokens from ./main output!"
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
exit 1
fi

@@ -144,7 +144,7 @@ while read -e line; do
fi

# Update cache for next prompt in background, ideally during user input
./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \
--n_predict 1 &

@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE

# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./bin/main $GEN_OPTIONS \
./bin/llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \

@@ -11,6 +11,6 @@ cd ..
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt
@@ -24,14 +24,16 @@ from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional

import numpy as np
from sentencepiece import SentencePieceProcessor

if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
# use .parent.parent since we are in "examples" directory
sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))

import gguf
from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab

if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias

@@ -174,7 +176,7 @@ class Params:
rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: float | None = None
f_rope_scale: float | None = None
n_orig_ctx: int | None = None
n_ctx_orig: int | None = None
rope_finetuned: bool | None = None

ftype: GGMLFileType | None = None

@@ -224,7 +226,7 @@ class Params:
with open(config_path) as f:
config = json.load(f)

rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
rope_scaling_type = f_rope_scale = n_ctx_orig = rope_finetuned = None
rope_scaling = config.get("rope_scaling")

if rope_scaling is not None and (typ := rope_scaling.get("type")):

@@ -234,7 +236,7 @@ class Params:
rope_scaling_type = gguf.RopeScalingType.LINEAR
elif typ == "yarn":
rope_scaling_type = gguf.RopeScalingType.YARN
n_orig_ctx = rope_scaling['original_max_position_embeddings']
n_ctx_orig = rope_scaling['original_max_position_embeddings']
rope_finetuned = rope_scaling['finetuned']
else:
raise NotImplementedError(f'Unknown rope scaling type: {typ}')

@@ -270,7 +272,7 @@ class Params:
f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type,
f_rope_scale = f_rope_scale,
n_orig_ctx = n_orig_ctx,
n_ctx_orig = n_ctx_orig,
rope_finetuned = rope_finetuned,
)
@@ -380,306 +382,6 @@ class Metadata:
return metadata

#
# vocab
#

@runtime_checkable
class BaseVocab(Protocol):
tokenizer_model: ClassVar[str]
name: ClassVar[str]

class NoVocab(BaseVocab):
tokenizer_model = "no_vocab"
name = "no_vocab"

def __repr__(self) -> str:
return "<NoVocab for a model without integrated vocabulary>"

@runtime_checkable
class Vocab(BaseVocab, Protocol):
vocab_size: int
added_tokens_dict: dict[str, int]
added_tokens_list: list[str]
fname_tokenizer: Path

def __init__(self, base_path: Path): ...
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...

class BpeVocab(Vocab):
tokenizer_model = "gpt2"
name = "bpe"

def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}

if (fname_tokenizer := base_path / 'vocab.json').exists():
# "slow" tokenizer
with open(fname_tokenizer, encoding="utf-8") as f:
self.vocab = json.load(f)

try:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
else:
# "fast" tokenizer
fname_tokenizer = base_path / FAST_TOKENIZER_FILE

# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding="utf-8") as f:
tokenizer_json = json.load(f)

tokenizer_model: dict[str, Any] = tokenizer_json['model']
if (
tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'ByteLevel'
):
raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')

self.vocab = tokenizer_model["vocab"]

if (added := tokenizer_json.get('added_tokens')) is not None:
# Added tokens here can be duplicates of the main vocabulary.
added_tokens = {item['content']: item['id']
for item in added
if item['content'] not in self.vocab}

vocab_size = len(self.vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
expected_end_id = vocab_size + len(actual_ids) - 1
raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
f"{vocab_size} - {expected_end_id}; got {actual_ids}")

items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_dict = added_tokens
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer

def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}

for i, _ in enumerate(self.vocab):
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL

def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.CONTROL

def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.bpe_tokens()
yield from self.added_tokens()

def __repr__(self) -> str:
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

class SentencePieceVocab(Vocab):
tokenizer_model = "llama"
name = "spm"

def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'tokenizer.model').exists():
# normal location
try:
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
# not found in alternate location either
raise FileNotFoundError('Cannot find tokenizer.model')

self.sentencepiece_tokenizer = SentencePieceProcessor()
self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
vocab_size = self.sentencepiece_tokenizer.vocab_size()

new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys())

if expected_new_ids != actual_new_ids:
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

# Token pieces that were added to the base vocabulary.
self.added_tokens_dict = added_tokens
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer

def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()):
piece = tokenizer.IdToPiece(i)
text = piece.encode("utf-8")
score: float = tokenizer.GetScore(i)

toktype = gguf.TokenType.NORMAL
if tokenizer.IsUnknown(i):
toktype = gguf.TokenType.UNKNOWN
if tokenizer.IsControl(i):
toktype = gguf.TokenType.CONTROL

# NOTE: I think added_tokens are user defined.
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED

if tokenizer.IsUnused(i):
toktype = gguf.TokenType.UNUSED
if tokenizer.IsByte(i):
toktype = gguf.TokenType.BYTE

yield text, score, toktype

def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.sentencepiece_tokens()
yield from self.added_tokens()

def __repr__(self) -> str:
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

class LlamaHfVocab(Vocab):
tokenizer_model = "llama"
name = "hfft"

def __init__(self, base_path: Path):
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding='utf-8') as f:
tokenizer_json = json.load(f)

# pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model']
is_llama3 = (
tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
and not tokenizer_model.get('byte_fallback', True)
)
if is_llama3:
raise TypeError('Llama 3 must be converted with BpeVocab')

if not is_llama3 and (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence'
):
raise FileNotFoundError('Cannot find Llama BPE tokenizer')

try:
from transformers import AutoTokenizer
except ImportError as e:
raise ImportError(
"To use LlamaHfVocab, please install the `transformers` package. "
"You can install it with `pip install transformers`."
) from e

# Allow the tokenizer to default to slow or fast versions.
# Explicitly set tokenizer to use local paths.
self.tokenizer = AutoTokenizer.from_pretrained(
base_path,
cache_dir=base_path,
local_files_only=True,
)
assert self.tokenizer.is_fast # assume tokenizer.json is used

# Initialize lists and dictionaries for added tokens
self.added_tokens_list = []
self.added_tokens_dict = dict()
self.added_tokens_ids = set()

# Process added tokens
for tok, tokidx in sorted(
self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
):
# Only consider added tokens that are not in the base vocabulary
if tokidx >= self.tokenizer.vocab_size:
self.added_tokens_list.append(tok)
self.added_tokens_dict[tok] = tokidx
self.added_tokens_ids.add(tokidx)

# Store special tokens and their IDs
self.specials = {
tok: self.tokenizer.get_vocab()[tok]
for tok in self.tokenizer.all_special_tokens
}
self.special_ids = set(self.tokenizer.all_special_ids)

# Set vocabulary sizes
self.vocab_size_base = self.tokenizer.vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)

self.fname_tokenizer = fname_tokenizer

def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
}

for token_id in range(self.vocab_size_base):
# Skip processing added tokens here
if token_id in self.added_tokens_ids:
continue

# Convert token text to bytes
token_text = reverse_vocab[token_id].encode("utf-8")

# Yield token text, score, and type
yield token_text, self.get_token_score(token_id), self.get_token_type(
token_id, token_text, self.special_ids # Reuse already stored special IDs
)

def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
# Special case for byte tokens
if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
return gguf.TokenType.BYTE

# Determine token type based on whether it's a special token
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL

def get_token_score(self, token_id: int) -> float:
# Placeholder for actual logic to determine the token's score
# This needs to be implemented based on specific requirements
return -1000.0 # Default score

def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
if text in self.specials:
toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
score = self.get_token_score(self.specials[text])
else:
toktype = gguf.TokenType.USER_DEFINED
score = -1000.0

yield text.encode("utf-8"), score, toktype

def has_newline_token(self):
return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab

def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.hf_tokens()
yield from self.added_tokens()

def __repr__(self) -> str:
return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

#
# data loading
# TODO: reuse (probably move to gguf.py?)

@@ -1162,8 +864,8 @@ class OutputFile:
self.gguf.add_rope_scaling_type(params.rope_scaling_type)
self.gguf.add_rope_scaling_factor(params.f_rope_scale)

if params.n_orig_ctx is not None:
if params.n_ctx_orig is not None:
self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
self.gguf.add_rope_scaling_orig_ctx_len(params.n_ctx_orig)

if params.rope_finetuned is not None:
self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
@@ -1,4 +1,4 @@
set(TARGET convert-llama2c-to-ggml)
set(TARGET llama-convert-llama2c-to-ggml)
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@@ -8,7 +8,7 @@ To convert the model first download the models from the [llama2.c](https://githu

After successful compilation, following usage options are available:
```
usage: ./convert-llama2c-to-ggml [options]
usage: ./llama-convert-llama2c-to-ggml [options]

options:
-h, --help show this help message and exit

@@ -19,10 +19,10 @@ options:

An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:

`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`

Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).

Now you can use the model with a command like:

`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {

params.samples_start_after_nl = false;
params.use_adam = true;
params.use_flash = true;
params.use_flash = false;
params.use_scratch = true;

// only adam

@@ -1,5 +1,5 @@
set(TARGET beam-search)
set(TARGET llama-cvector-generator)
add_executable(${TARGET} beam-search.cpp)
add_executable(${TARGET} cvector-generator.cpp pca.hpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/cvector-generator/README.md (new file)
@@ -0,0 +1,34 @@
# cvector-generator

This example demonstrates how to generate a control vector using gguf models.

Related PRs:
- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)

## Examples

```sh
# CPU only
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf

# With GPU
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99

# With advanced options
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100

# To see help message
./cvector-generator -h
# Then, have a look at "cvector" section
```

## Tips and tricks

If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example:

```
<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
<|im_start|>system\nYou are in a very good mood today<|im_end|>
```
examples/cvector-generator/completions.txt (new file)
@@ -0,0 +1,582 @@
That game
I can see
Hmm, this
I can relate to
Who is
I understand the
Ugh,
What the hell was
Hey, did anyone
Although
Thank you for choosing
What are you
Oh w
How dare you open
It was my pleasure
I'm hon
I appreciate that you
Are you k
Whoever left this
It's always
Ew,
Hey, I l
Hello? Is someone
I understand that
That poem
Aww, poor
Hey, it
Alright, who
I didn't
Well, life
The document
Oh no, this
I'm concerned
Hello, this is
This art
Hmm, this drink
Hi there!
It seems
Is
Good
I can't
Ex
Who are
I can see that
Wow,
Today is a
Hey friend
Sometimes friends
Oh, this old
The weather outside
This place is sur
I appreciate your input
Thank you for the
Look at
I'm disappoint
To my
How dare you
That's an
This piece of art
Eww
This park is
This is incredible
Oh no, someone
Exc
Well, it'
I warned
Hey, I understand
Hey, I saw
How dare you go
What the he
Hey
It's
Hello? Hello?
It
Oh no!
This is the perfect
Good morning,
Oh no, there
It's so
Yeah
Uh,
Hello everyone
Who turned off
The weather
Who'
Hey, this
Wait,
Eww, gross
Excuse
It seems like you
Thank you so
What happened?
Oh my g
I am deeply sad
I war
Okay, let'
Hey, that
That was a beautiful
Oh no! That
What happened
Hey there
The artist'
What?!
Hey, it'
I am disappoint
It seems like
Oh no! The
This park is a
If you
Yes! I did
It sounds
What
Who is it
Hmm, that
That's strange
Yeah, that was
That's interesting
This park
What the hell
Who is that
I feel like my
Oh well
What the hell is
Hello? Hello
To my dearest
Bless you!\"
Thank you for
Oh, looks like
Can you please
This place is
Eww, what
Bless you
Is everything
Hey, I just
Whoever left these
Well, that'
I feel
Hey, do you
It's sad
Oh no, it
Hey, that'
Oh my god,
Thank you,
Hello little one,
I apolog
Hey team, I
How dare you read
Who is this and
Whoever left
Hi there! W
A
If you have
I was
U
Bless
Well, this
Oh, I'
It's a
Eww,
Is everything okay?
Oh, I
Hello, can you
Al
That was a great
What are
I understand that not
Oh no, not
Who is it?\"
Hey, can we
Whoever is taking
I would love to
Hey, I noticed
Hey, could
I understand that there
Hello?
D
Oh man, I
Thank you so much
Oh no, my
Dear [Name
Uh
I remember
Hey, who
Well, it
Are you
I understand that it
Hey, is
I would
Who is this
Excuse me
Alright
I am thrilled
Sometimes friends have
Who the
It's interesting
I would love
E
Hello? Is anyone
Well, this is
This place
Well,
I warned you
Hey, watch where
Oh my
That'
Sometimes friends have different
I understand that everyone
What?
What do these notes
I can relate
I'm not
I understand
To my dear
Guys
Well
Hey, I appreciate
Wow, what
Dear
That melody
Who the hell
Today is
Hello little
Wow, look
That's great
Love is never wrong
I'm having
Whoa, did
Ugh
Can you please provide
I miss you,
I feel uncom
I know
Ugh, this
Hey, watch
Oh great, a
I didn
Okay
That game of char
Oh
I appreciate
Who's there
I am so
Oh great, someone
Hey, could you
I remember wondering
Wait, what?
What do
Hello? Can
Hey there,
That game of
This is incred
Oh my gosh
Oh great, f
I appreciate your
It sounds like
What the heck
Okay, I understand
Ew
I understand that this
Uh, hi
Hi everyone!
What the hell?
Thank you for your
Oh no, the
Wow, I
Who turned
Dear [
Whoever
This is a
Whoa, he
What in the world
Although the physical
Hello, who is
That's amaz
Hey, I know
Okay, that
Hi everyone
Hey, is everything
I understand your fr
Oh no, poor
Oh, look
Good morning
Ew, gross
Oh no, did
Look at the family
Hey team
Yes!
Hey, can I
Okay, that'
It's great
Love is
Hey, what
Good morning, world
Who is it?
That poem really reson
I
That's
I understand the task
Gu
Hello? Who'
This postcard is
Whoa,
Oh, that
I understand that I
Whoever is
Hello? Who is
I'm really
Wow, this
Can
This artwork really
This is a shame
I miss you too
Who are you?
Today is a difficult
Hey, just
Are you okay
I am
Hi,
Wow, that
Hey there! Can
Okay, stay
Oh great, just
Yeah,
Hello? Can you
Oh, looks
Thank you for sharing
I'm glad
Hey, is that
Hmm
It was my
It sounds like you
Wow, your
I was promised certain
That was such a
Thank
Excuse you
That was
Hey team,
I feel un
It was
What'
Hey friend, I
How
Saying goodbye
That
It's heart
How dare
Oh,
Hello, may
What's this
Thank you for recogn
Aww, that
Oh, I remember
Hmm, that'
I miss
I know this
Wait
Is everything okay
Who is that person
Wow, you
Oh great
I'm sad
Wow, the
I am very disappoint
Who turned off the
I understand that things
I'm very
Hi
That's very
Okay, I
Oh no,
Wow, there
What's wrong
I apologize for
Hey, I
Can I help you
Oh, I didn
Alright,
Oh wow,
Oh my goodness
I know this event
What in the
Saying
Yeah, that
Guys, I
Hey, this v
This post
Are
Hey, can
Hello? Is
I can only imagine
Oh, that sounds
Hey, is anyone
I am disappointed
Hello,
Hey everyone, I
That was such
It's okay
The artist
Whoa
I understand that mistakes
Can I help
Who
Hi everyone! I
Hey, can you
Wow, how
Today
Oh no, I
Oh well, I
Well, that
This is the
Yes! I finally
Hey there little
Hello everyone!
|
||||||
|
Love is never
|
||||||
|
Look at the
|
||||||
|
This postcard
|
||||||
|
Oh great,
|
||||||
|
Can I
|
||||||
|
Hmm, this is
|
||||||
|
I understand your
|
||||||
|
Oh, look at
|
||||||
|
B
|
||||||
|
I'm so
|
||||||
|
Whoa, this
|
||||||
|
W
|
||||||
|
Oh, this
|
||||||
|
Sometimes
|
||||||
|
This piece of
|
||||||
|
What the
|
||||||
|
That was a
|
||||||
|
Hey, do
|
||||||
|
Oh no
|
||||||
|
Whoa, what
|
||||||
|
I feel like I
|
||||||
|
The documentary
|
||||||
|
Hello
|
||||||
|
Hello little one
|
||||||
|
I understand that my
|
||||||
|
Eww, that
|
||||||
|
Wow, an
|
||||||
|
Yes! Finally,
|
||||||
|
Although the physical location
|
||||||
|
Whoever is watching
|
||||||
|
That movie
|
||||||
|
I remember wondering about
|
||||||
|
Hey there, little
|
||||||
|
Who's
|
||||||
|
Hello, who
|
||||||
|
Hello everyone! Thank
|
||||||
|
Hello, can
|
||||||
|
That's too
|
||||||
|
Hey, just wanted
|
||||||
|
Hey there, I
|
||||||
|
Saying good
|
||||||
|
Hey there!
|
||||||
|
Who is there?
|
||||||
|
Oh my good
|
||||||
|
I am very
|
||||||
|
Oh no, what
|
||||||
|
Wow, thank
|
||||||
|
I was promised
|
||||||
|
Hi, is
|
||||||
|
Hey, I'
|
||||||
|
Guys, the
|
||||||
|
Oh no, that
|
||||||
|
Who is there
|
||||||
|
Hello, this
|
||||||
|
That movie really touched
|
||||||
|
If you have something
|
||||||
|
The documentary was
|
||||||
|
I'm starting
|
||||||
|
Are you kidd
|
||||||
|
That movie really
|
||||||
|
Hey everyone,
|
||||||
|
Thank you for considering
|
||||||
|
I didn'
|
||||||
|
Yes! I
|
||||||
|
Can you
|
||||||
|
Oh my god
|
||||||
|
Hey, whoever
|
||||||
|
That melody really
|
||||||
|
Thank you, little
|
||||||
|
Hello, may I
|
||||||
|
Look
|
||||||
|
Wow, we
|
||||||
|
It looks
|
||||||
|
What do these
|
||||||
|
Oh wow
|
||||||
|
I apologize
|
||||||
|
What are you all
|
||||||
|
It's such
|
||||||
|
It's clear
|
||||||
|
Hey, I was
|
||||||
|
Hey friend,
|
||||||
|
I can only
|
||||||
|
The weather outside is
|
||||||
|
Eww, this
|
||||||
|
I miss you
|
||||||
|
Wow
|
||||||
|
Aww,
|
||||||
|
Hi, is there
|
||||||
|
This artwork
|
||||||
|
Okay,
|
||||||
|
Oh well,
|
||||||
|
This
|
||||||
|
I'
|
||||||
|
Say
|
||||||
|
Hey there little gu
|
||||||
|
Hmm,
|
||||||
|
Whoa, who
|
||||||
|
I am thr
|
||||||
|
Oh man
|
||||||
|
Okay, stay calm
|
||||||
|
I'm happy
|
||||||
|
Oh, this cur
|
||||||
|
Oh man,
|
||||||
|
I'm sorry
|
||||||
|
Hello? Who
|
||||||
|
What?! That
|
||||||
|
This piece
|
||||||
|
Hey everyone
|
||||||
|
That's so
|
||||||
|
Are you okay?
|
||||||
|
What happened? Where
|
||||||
|
Hi there
|
||||||
|
The
|
||||||
|
Who the hell entered
|
||||||
|
I can
|
||||||
|
Guys,
|
||||||
|
What's
|
||||||
|
What in
|
||||||
|
It's important
|
||||||
|
I'm
|
||||||
|
I'm coming
|
||||||
|
It'
|
||||||
|
Yes! Finally
|
||||||
|
Wait, what
|
||||||
|
Wow, reading
|
||||||
|
I'm surprised
|
||||||
|
Hey, did
|
||||||
|
Hey,
|
||||||
|
Okay, let
|
||||||
|
I understand that you
|
||||||
|
Who the hell threw
|
||||||
|
Eww, who
|
||||||
|
Thank you for thinking
|
||||||
|
Who is this?\"
|
||||||
|
I am deeply
|
||||||
|
Thank you for including
|
||||||
|
Oh no, an
|
||||||
|
It looks like you
|
||||||
|
Aww
|
||||||
|
I'm confused
|
||||||
|
Wow, it
|
||||||
|
That poem really
|
||||||
|
Yes
|
||||||
|
Hey there, is
|
||||||
|
Hey, what'
|
||||||
|
Thank you for remember
|
||||||
|
To
|
||||||
|
This is
|
||||||
|
Thank you for making
|
||||||
|
I can'
|
||||||
|
That mel
|
||||||
|
Wow, they
|
||||||
|
I feel like
|
||||||
|
Although the
|
||||||
|
Who are you
|
||||||
|
Love
|
||||||
|
If
|
||||||
|
What the hell are
|
||||||
|
I am so sad
|
||||||
|
Oh, I found
|
||||||
|
Thank you
|
||||||
|
It looks like
|
||||||
|
Well, life is
|
||||||
|
I appreciate that
|
||||||
|
The artist's
|
||||||
|
Whoa, that
|
||||||
|
It's never
|
499
examples/cvector-generator/cvector-generator.cpp
Normal file
499
examples/cvector-generator/cvector-generator.cpp
Normal file
|
@ -0,0 +1,499 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "pca.hpp"
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include "ggml-metal.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <climits>
|
||||||
|
#include <cstring>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// utils
|
||||||
|
|
||||||
|
template <class Iter>
|
||||||
|
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
|
||||||
|
std::string ret;
|
||||||
|
for (; begin != end; ++begin) {
|
||||||
|
ret += llama_token_to_piece(ctx, *begin);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||||
|
gpt_params_print_usage(argc, argv, params);
|
||||||
|
|
||||||
|
printf("\nexample usage:\n");
|
||||||
|
printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
|
||||||
|
printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
|
||||||
|
printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
|
// cb_eval is reused for each pair of positive/negative prompts
|
||||||
|
struct callback_data {
|
||||||
|
ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered
|
||||||
|
|
||||||
|
int n_layers = 0;
|
||||||
|
int n_tokens = 0;
|
||||||
|
bool is_eval_pos = true;
|
||||||
|
|
||||||
|
// each element of the vector corresponds to one layer
|
||||||
|
std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
|
||||||
|
std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
|
||||||
|
std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows may differ for each layer
|
||||||
|
|
||||||
|
// save a tensor into either v_pos or v_neg (decided by is_eval_pos)
|
||||||
|
void save_tensor_for_layer(struct ggml_tensor * t) {
|
||||||
|
GGML_ASSERT(t->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
if (ctx_ggml == nullptr) {
|
||||||
|
// alloc a new ctx_ggml if needed
|
||||||
|
struct ggml_init_params params_ggml = {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u,
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ctx_ggml = ggml_init(params_ggml);
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy tensor data
|
||||||
|
auto n_bytes = ggml_nbytes(t);
|
||||||
|
struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
|
||||||
|
t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
|
||||||
|
ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
|
||||||
|
ggml_set_name(t_layer, ggml_get_name(t));
|
||||||
|
//print_debug_tensor(t_layer);
|
||||||
|
|
||||||
|
if (is_eval_pos) {
|
||||||
|
v_pos.push_back(t_layer);
|
||||||
|
} else {
|
||||||
|
v_neg.push_back(t_layer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// calculate diff (v_pos - v_neg) and place the result back to v_pos
|
||||||
|
// all zero rows in the diff tensor will also be removed
|
||||||
|
// NOTE: the final layer is ignored; we only have (n_layers - 1) layers to process
|
||||||
|
std::vector<struct ggml_tensor *> calc_diff() {
|
||||||
|
for (size_t il = 0; il < v_pos.size(); il++) {
|
||||||
|
float * a = (float *) v_pos[il]->data;
|
||||||
|
float * b = (float *) v_neg[il]->data;
|
||||||
|
size_t n_elem = ggml_nelements(v_pos[il]);
|
||||||
|
for (size_t j = 0; j < n_elem; j++) {
|
||||||
|
a[j] -= b[j];
|
||||||
|
}
|
||||||
|
//print_debug_tensor(v_pos[i]);
|
||||||
|
auto diff_filtered = filter_nonzero_rows(v_pos[il]);
|
||||||
|
v_diff_filtered.push_back(diff_filtered);
|
||||||
|
}
|
||||||
|
return v_diff_filtered; // for convenience, we return the resulting std::vector
|
||||||
|
}
|
||||||
|
|
||||||
|
// delete zero rows from a given 2D tensor
|
||||||
|
struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
|
||||||
|
//printf("filter_nonzero_rows\n");
|
||||||
|
auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
|
||||||
|
// check if the given row contains only zero elements
|
||||||
|
int n_cols = t->ne[0]; // hint: should be equal to n_embd
|
||||||
|
for (int col = 0; col < n_cols; ++col) {
|
||||||
|
if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
std::vector<int> rows_to_copy; // the indices of the non-zero rows (to be copied into rows of diff_filtered)
|
||||||
|
for (int i_row = 0; i_row < a->ne[1]; i_row++) {
|
||||||
|
if (!is_row_all_zeros(a, i_row, 1e-6)) {
|
||||||
|
rows_to_copy.push_back(i_row);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// get "n_nonzero_rows" for the output "diff_filtered"
|
||||||
|
int n_nonzero_rows = rows_to_copy.size();
|
||||||
|
//printf("n_nonzero_rows: %d\n", n_nonzero_rows);
|
||||||
|
int n_embd = a->ne[0];
|
||||||
|
GGML_ASSERT(n_nonzero_rows > 0);
|
||||||
|
|
||||||
|
// diff_filtered: [n_embd, n_nonzero_rows]
|
||||||
|
struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
|
||||||
|
ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
|
||||||
|
ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
|
||||||
|
diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
|
||||||
|
|
||||||
|
// copy non-zero rows
|
||||||
|
for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
|
||||||
|
int src_row = rows_to_copy[dest_row];
|
||||||
|
for (int i = 0; i < n_embd; i++) {
|
||||||
|
float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
|
||||||
|
ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//print_debug_tensor(diff_filtered);
|
||||||
|
|
||||||
|
return diff_filtered;
|
||||||
|
}
|
||||||
|
|
||||||
|
// we don't implement a destructor because we want to reuse callback_data; we just free the tensor data here
|
||||||
|
void reset() {
|
||||||
|
for (auto ptr : v_pos) free(ptr->data);
|
||||||
|
for (auto ptr : v_neg) free(ptr->data);
|
||||||
|
for (auto ptr : v_diff_filtered) free(ptr->data);
|
||||||
|
v_pos.clear();
|
||||||
|
v_neg.clear();
|
||||||
|
v_diff_filtered.clear();
|
||||||
|
if (ctx_ggml) {
|
||||||
|
ggml_free(ctx_ggml);
|
||||||
|
}
|
||||||
|
ctx_ggml = nullptr;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* train_context stores the ggml context used for pre/post-processing the diff vectors
|
||||||
|
* in short, input => v_diff and output => v_final
|
||||||
|
*/
|
||||||
|
struct train_context {
|
||||||
|
ggml_context * ctx_ggml;
|
||||||
|
int n_embd;
|
||||||
|
int n_layers;
|
||||||
|
|
||||||
|
/* pair of prompts to be used for generating final vector */
|
||||||
|
std::vector<std::string> positive_entries;
|
||||||
|
std::vector<std::string> negative_entries;
|
||||||
|
|
||||||
|
// each element of the vector corresponds to one layer
|
||||||
|
// NOTE: the last layer is discarded; therefore, we will have (n_layers - 1) elements here
|
||||||
|
// NOTE (2): v_diff is transposed from v_diff_tmp
|
||||||
|
std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
|
||||||
|
std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
|
||||||
|
|
||||||
|
// to make re-allocation easy when concatenating into v_diff, we temporarily store v_diff in a vector instead of a tensor
|
||||||
|
// v_diff_tmp will get converted into v_diff later on
|
||||||
|
std::vector<std::vector<uint8_t>> v_diff_tmp;
|
||||||
|
|
||||||
|
train_context(int n_embd_, int n_layers_) {
|
||||||
|
n_embd = n_embd_;
|
||||||
|
n_layers = n_layers_;
|
||||||
|
struct ggml_init_params params_ggml = {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ctx_ggml = ggml_init(params_ggml);
|
||||||
|
for (int il = 0; il < n_layers - 1; il++) {
|
||||||
|
std::vector<uint8_t> empty;
|
||||||
|
v_diff_tmp.push_back(empty);
|
||||||
|
auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
|
||||||
|
t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
|
||||||
|
v_final.push_back(t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// add new rows into existing tensor in v_diff_tmp
|
||||||
|
void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
|
||||||
|
GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
|
||||||
|
for (int il = 0; il < n_layers - 1; il++) {
|
||||||
|
auto t = diff_filtered[il];
|
||||||
|
auto & diff_tmp = v_diff_tmp[il];
|
||||||
|
size_t curr_size = diff_tmp.size();
|
||||||
|
diff_tmp.resize(curr_size + ggml_nbytes(t));
|
||||||
|
memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// build the v_diff tensors from v_diff_tmp (v_diff needs to be transposed)
|
||||||
|
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
|
||||||
|
void build_v_diff() {
|
||||||
|
printf("build_v_diff\n");
|
||||||
|
for (int il = 0; il < n_layers - 1; il++) {
|
||||||
|
auto & diff_tmp = v_diff_tmp[il];
|
||||||
|
int n_elem = diff_tmp.size() / sizeof(float);
|
||||||
|
GGML_ASSERT(n_elem % n_embd == 0);
|
||||||
|
int n_rows = n_elem / n_embd;
|
||||||
|
struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
|
||||||
|
ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
|
||||||
|
// copy data & transpose
|
||||||
|
diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
|
||||||
|
float * arr = (float *) diff_tmp.data();
|
||||||
|
for (int ir = 0; ir < n_rows; ++ir) {
|
||||||
|
for (int ic = 0; ic < n_embd; ++ic) {
|
||||||
|
float f = arr[ir*n_embd + ic];
|
||||||
|
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
v_diff.push_back(diff);
|
||||||
|
print_debug_tensor(diff);
|
||||||
|
// free memory of diff_tmp
|
||||||
|
diff_tmp.resize(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
~train_context() {
|
||||||
|
for (auto ptr : v_final) free(ptr->data);
|
||||||
|
for (auto ptr : v_diff) free(ptr->data);
|
||||||
|
// no need to free v_diff_tmp, since we didn't use malloc
|
||||||
|
ggml_free(ctx_ggml);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct tokenized_prompt {
|
||||||
|
std::vector<llama_token> tokens_pos;
|
||||||
|
std::vector<llama_token> tokens_neg;
|
||||||
|
size_t max_seq_len;
|
||||||
|
|
||||||
|
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
|
||||||
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
|
||||||
|
tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
|
||||||
|
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
|
||||||
|
padding_seq(ctx, tokens_pos, max_seq_len);
|
||||||
|
padding_seq(ctx, tokens_neg, max_seq_len);
|
||||||
|
}
|
||||||
|
|
||||||
|
void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
|
||||||
|
// TODO: customize padding token
|
||||||
|
std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
|
||||||
|
llama_token pad_tok = pad_tokens.back();
|
||||||
|
while (tokens.size() < len) {
|
||||||
|
tokens.push_back(pad_tok);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
static std::string to_string(const T & val) {
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << val;
|
||||||
|
return ss.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
|
||||||
|
std::vector<std::string> output;
|
||||||
|
std::ifstream file(path);
|
||||||
|
if (!file.is_open()) {
|
||||||
|
fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
std::string line;
|
||||||
|
while (std::getline(file, line)) {
|
||||||
|
bool is_skip = skip_empty_lines && line.empty();
|
||||||
|
if (!is_skip) {
|
||||||
|
string_process_escapes(line);
|
||||||
|
output.push_back(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
file.close();
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
|
||||||
|
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
|
auto * cb_data = (callback_data *) user_data;
|
||||||
|
static const char * l_out_name = "l_out";
|
||||||
|
const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
|
||||||
|
|
||||||
|
if (ask) {
|
||||||
|
return is_l_out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// save the tensor to current context
|
||||||
|
cb_data->save_tensor_for_layer(t);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
|
||||||
|
llama_kv_cache_clear(ctx);
|
||||||
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
|
||||||
|
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
|
||||||
|
struct gguf_context * ctx = gguf_init_empty();
|
||||||
|
|
||||||
|
const std::string arch = "controlvector";
|
||||||
|
gguf_set_val_str(ctx, "general.architecture", arch.c_str());
|
||||||
|
gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
|
||||||
|
gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
|
||||||
|
|
||||||
|
for (size_t i = 0; i < v_ctrl.size(); ++i) {
|
||||||
|
gguf_add_tensor(ctx, v_ctrl[i]);
|
||||||
|
print_debug_tensor(v_ctrl[i]);
|
||||||
|
printf("Added tensor: %s\n", v_ctrl[i]->name);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: writing file...\n", __func__);
|
||||||
|
gguf_write_to_file(ctx, fname.c_str(), false);
|
||||||
|
printf("%s: wrote file '%s'\n", __func__, fname.c_str());
|
||||||
|
gguf_free(ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load prompt files and completion file.
|
||||||
|
* Then format each pair of prompt + completion to make an entry.
|
||||||
|
*/
|
||||||
|
static int prepare_entries(gpt_params & params, train_context & ctx_train) {
|
||||||
|
// load prompts
|
||||||
|
std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
|
||||||
|
std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
|
||||||
|
if (positive_prompts.size() != negative_prompts.size()) {
|
||||||
|
fprintf(stderr, "number of positive and negative prompts must be equal\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (positive_prompts.empty()) {
|
||||||
|
fprintf(stderr, "must provide at least one prompt pair\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// create templated prompts
|
||||||
|
std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
|
||||||
|
auto format_template = [](std::string persona, std::string suffix) {
|
||||||
|
// entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
|
||||||
|
return persona + " " + suffix;
|
||||||
|
};
|
||||||
|
for (size_t i = 0; i < positive_prompts.size(); ++i) {
|
||||||
|
for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
|
||||||
|
// TODO replicate the truncations done by the python implementation
|
||||||
|
ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
|
||||||
|
ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
gpt_params params;
|
||||||
|
|
||||||
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
print_usage(argc, argv, params);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.n_pca_iterations % params.n_pca_batch != 0) {
|
||||||
|
fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
callback_data cb_data;
|
||||||
|
|
||||||
|
// pass the callback to the backend scheduler
|
||||||
|
// it will be executed for each node during the graph computation
|
||||||
|
params.cb_eval = cb_eval;
|
||||||
|
params.cb_eval_user_data = &cb_data;
|
||||||
|
params.warmup = false;
|
||||||
|
|
||||||
|
print_build_info();
|
||||||
|
llama_backend_init();
|
||||||
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
|
// load the model to get hparams
|
||||||
|
llama_model * model;
|
||||||
|
llama_context * ctx;
|
||||||
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
|
// int n_ctx = llama_n_ctx(ctx);
|
||||||
|
int n_layers = llama_n_layer(model);
|
||||||
|
int n_embd = llama_n_embd(model);
|
||||||
|
// get model hint param (a.k.a model arch name)
|
||||||
|
char model_hint[128];
|
||||||
|
llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
|
||||||
|
|
||||||
|
// init train_context
|
||||||
|
train_context ctx_train(n_embd, n_layers);
|
||||||
|
|
||||||
|
// load and prepare entries for training
|
||||||
|
prepare_entries(params, ctx_train);
|
||||||
|
|
||||||
|
// we have to pre-tokenize everything first, because otherwise we don't know how much overhead to allocate for ctx_diffs_wrapped
|
||||||
|
std::vector<tokenized_prompt> tokenized_prompts;
|
||||||
|
size_t n_total_tokens = 0;
|
||||||
|
for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
|
||||||
|
tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
|
||||||
|
n_total_tokens += 2 * t.max_seq_len;
|
||||||
|
tokenized_prompts.push_back(std::move(t));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
|
||||||
|
|
||||||
|
for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
|
||||||
|
bool success = false;
|
||||||
|
tokenized_prompt t = tokenized_prompts[i];
|
||||||
|
cb_data.n_layers = n_layers;
|
||||||
|
cb_data.n_tokens = t.max_seq_len;
|
||||||
|
|
||||||
|
printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
|
||||||
|
(int) i+1, (int) ctx_train.positive_entries.size(),
|
||||||
|
tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
|
||||||
|
tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
|
||||||
|
(int) t.max_seq_len);
|
||||||
|
|
||||||
|
cb_data.is_eval_pos = true;
|
||||||
|
success = get_hidden_layers(ctx, t.tokens_pos);
|
||||||
|
if (!success) break;
|
||||||
|
|
||||||
|
cb_data.is_eval_pos = false;
|
||||||
|
success = get_hidden_layers(ctx, t.tokens_neg);
|
||||||
|
if (!success) break;
|
||||||
|
|
||||||
|
// calculate diff and remove all zero rows
|
||||||
|
auto v_diff_filtered = cb_data.calc_diff();
|
||||||
|
|
||||||
|
// save & concat the filtered v_diff to ctx_train
|
||||||
|
ctx_train.concat_diff_tmp(v_diff_filtered);
|
||||||
|
|
||||||
|
// reset for next iteration
|
||||||
|
cb_data.reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
// done with the model, we can now free it to gain back some memory
|
||||||
|
printf("Done evaluate prompts, unload model...\n");
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
|
// prepare ctx_train for PCA
|
||||||
|
ctx_train.build_v_diff();
|
||||||
|
|
||||||
|
// run PCA
|
||||||
|
PCA::pca_params pca_params;
|
||||||
|
pca_params.n_threads = params.n_threads;
|
||||||
|
pca_params.n_batch = params.n_pca_batch;
|
||||||
|
pca_params.n_iterations = params.n_pca_iterations;
|
||||||
|
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
||||||
|
|
||||||
|
// write output vectors to gguf
|
||||||
|
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
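The diff-and-filter step above (`callback_data::calc_diff` plus `filter_nonzero_rows`) boils down to an element-wise subtraction followed by dropping the (near-)zero rows, e.g. padded token positions where both prompts produce the same hidden state. A minimal standalone sketch of that idea, not taken from the patch, using plain `std::vector` instead of ggml tensors and an absolute-value threshold (the actual code compares the raw value against `eps`):

```cpp
#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

// one row per token position, n_embd floats per row
using hidden_states = std::vector<std::vector<float>>;

// subtract neg from pos element-wise, then drop rows whose diff is ~zero
// (e.g. padding positions where both prompts produced the same state)
static hidden_states diff_and_filter(const hidden_states & pos, const hidden_states & neg, float eps = 1e-6f) {
    hidden_states out;
    for (size_t row = 0; row < pos.size(); row++) {
        std::vector<float> diff(pos[row].size());
        bool all_zero = true;
        for (size_t i = 0; i < diff.size(); i++) {
            diff[i] = pos[row][i] - neg[row][i];
            if (std::fabs(diff[i]) > eps) {
                all_zero = false;
            }
        }
        if (!all_zero) {
            out.push_back(std::move(diff));
        }
    }
    return out;
}

int main() {
    hidden_states pos = {{1.0f, 2.0f}, {0.5f, 0.5f}, {3.0f, 1.0f}};
    hidden_states neg = {{0.5f, 1.0f}, {0.5f, 0.5f}, {2.0f, 0.0f}};
    hidden_states d = diff_and_filter(pos, neg);
    printf("kept %zu of %zu rows\n", d.size(), pos.size()); // the middle row is dropped
    return 0;
}
```

The exported control vector gguf is then meant to be applied at inference time through the control-vector options of the main CLI (e.g. `--control-vector-scaled <file> <strength>`).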
1
examples/cvector-generator/negative.txt
Normal file
1
examples/cvector-generator/negative.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
[INST] Act like a person who is extremely sad. [/INST]
|
322
examples/cvector-generator/pca.hpp
Normal file
322
examples/cvector-generator/pca.hpp
Normal file
|
@ -0,0 +1,322 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include "ggml-metal.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <ctime>
|
||||||
|
#include <string>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <random>
|
||||||
|
#include <cmath>
|
||||||
|
|
||||||
|
#define DEBUG_POS 5
|
||||||
|
|
||||||
|
static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
|
||||||
|
printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
|
||||||
|
if (!with_data) return;
|
||||||
|
printf("%s: %s[0] = [", __func__, t->name);
|
||||||
|
for (size_t i = 0; i <= DEBUG_POS; i++) {
|
||||||
|
printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
|
||||||
|
}
|
||||||
|
printf(" ... ]\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace PCA {
|
||||||
|
|
||||||
|
// input params for PCA computations
|
||||||
|
struct pca_params {
|
||||||
|
int n_threads = 1;
|
||||||
|
int n_batch = 20; // number of iterations to do in one batch; the larger the batch, the more memory is used
|
||||||
|
int n_iterations = 1000;
|
||||||
|
float tolerance = 1e-7;
|
||||||
|
|
||||||
|
// for debugging
|
||||||
|
int i_layer = 0;
|
||||||
|
int n_layers = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
// result from each iteration
|
||||||
|
struct pca_result {
|
||||||
|
struct ggml_tensor * calculated_square = NULL;
|
||||||
|
std::vector<struct ggml_tensor *> eigenvectors;
|
||||||
|
std::vector<float> distances;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct pca_model {
|
||||||
|
ggml_backend_t backend = NULL;
|
||||||
|
ggml_backend_buffer_t buffer;
|
||||||
|
struct ggml_context * ctx; // context to compute graph on target device
|
||||||
|
struct ggml_context * ctx_host; // host context to store results
|
||||||
|
|
||||||
|
// tensors on target device
|
||||||
|
struct ggml_tensor * dev_input;
|
||||||
|
struct ggml_tensor * dev_square;
|
||||||
|
struct ggml_tensor * dev_eigenvector;
|
||||||
|
|
||||||
|
pca_model(struct ggml_tensor * t_input) {
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
fprintf(stderr, "%s: using CUDA backend\n", __func__);
|
||||||
|
backend = ggml_backend_cuda_init(0); // init device 0
|
||||||
|
if (!backend) {
|
||||||
|
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// TODO: enable Metal support when support for GGML_OP_SQRT is added
|
||||||
|
// #ifdef GGML_USE_METAL
|
||||||
|
// fprintf(stderr, "%s: using Metal backend\n", __func__);
|
||||||
|
// backend = ggml_backend_metal_init();
|
||||||
|
// if (!backend) {
|
||||||
|
// fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
|
||||||
|
// }
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
// if no GPU backend is available, fall back to the CPU backend
|
||||||
|
if (!backend) {
|
||||||
|
backend = ggml_backend_cpu_init();
|
||||||
|
}
|
||||||
|
|
||||||
|
const int num_tensors = 4;
|
||||||
|
struct ggml_init_params params {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ctx = ggml_init(params);
|
||||||
|
|
||||||
|
auto n_samples = t_input->ne[0];
|
||||||
|
auto n_embd = t_input->ne[1];
|
||||||
|
|
||||||
|
dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
|
||||||
|
dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||||
|
dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
|
ggml_set_name(dev_input, "dev_input");
|
||||||
|
ggml_set_name(dev_square, "dev_square");
|
||||||
|
ggml_set_name(dev_eigenvector, "dev_eigenvector");
|
||||||
|
buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
||||||
|
ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
|
||||||
|
|
||||||
|
// initialize eigenvector to random normalized vector
|
||||||
|
{
|
||||||
|
std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
|
||||||
|
std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
|
||||||
|
std::uniform_real_distribution<float> distribution(0.0, 1.0);
|
||||||
|
float sum_sqr = 0.0; // for normalizing random_vec
|
||||||
|
for (size_t i = 0; i < random_vec.size(); ++i) {
|
||||||
|
float f = distribution(generator);
|
||||||
|
sum_sqr += f * f;
|
||||||
|
random_vec[i] = f;
|
||||||
|
}
|
||||||
|
// normalize it
|
||||||
|
float random_vec_norm = std::sqrt(sum_sqr);
|
||||||
|
for (size_t i = 0; i < random_vec.size(); ++i) {
|
||||||
|
random_vec[i] /= random_vec_norm;
|
||||||
|
}
|
||||||
|
ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
~pca_model() {
|
||||||
|
ggml_free(ctx);
|
||||||
|
ggml_backend_buffer_free(buffer);
|
||||||
|
ggml_backend_free(backend);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct ggml_cgraph * build_graph_piter(
|
||||||
|
const struct pca_params & params,
|
||||||
|
const pca_model & model,
|
||||||
|
bool calc_square = false) {
|
||||||
|
GGML_ASSERT(params.n_batch > 0);
|
||||||
|
// TODO: buf_size must be able to scale with params.n_batch
|
||||||
|
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
|
||||||
|
static std::vector<uint8_t> buf(buf_size);
|
||||||
|
|
||||||
|
struct ggml_init_params params0 = {
|
||||||
|
/*.mem_size =*/ buf_size,
|
||||||
|
/*.mem_buffer =*/ buf.data(),
|
||||||
|
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
|
||||||
|
};
|
||||||
|
// create a temporary context to build the graph
|
||||||
|
struct ggml_context * ctx0 = ggml_init(params0);
|
||||||
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||||
|
|
||||||
|
// turn v_diff_original into square matrix if needed
|
||||||
|
struct ggml_tensor * tmp_square;
|
||||||
|
if (calc_square) {
|
||||||
|
tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
|
||||||
|
ggml_set_name(tmp_square, "tmp_square");
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * b_tensor;
|
||||||
|
struct ggml_tensor * distance;
|
||||||
|
struct ggml_tensor * old_eigen = model.dev_eigenvector;
|
||||||
|
struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
|
||||||
|
|
||||||
|
for (int i = 0; i < params.n_batch; ++i) {
|
||||||
|
// b_tensor = square * eigenvector^T
|
||||||
|
b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
|
||||||
|
ggml_set_name(b_tensor, "b_tensor");
|
||||||
|
|
||||||
|
// normalize
|
||||||
|
b_tensor = ggml_div_inplace(ctx0,
|
||||||
|
b_tensor,
|
||||||
|
ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
|
||||||
|
);
|
||||||
|
ggml_format_name(b_tensor, "b_tensor_norm_%d", i);
|
||||||
|
|
||||||
|
// calculate distance(new eigenvector - old eigenvector)
|
||||||
|
// we don't use ggml_sub because it may not be implemented on GPU backend
|
||||||
|
struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
|
||||||
|
distance = ggml_sqrt_inplace(ctx0,
|
||||||
|
ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
|
||||||
|
ggml_format_name(distance, "distance_%d", i);
|
||||||
|
|
||||||
|
old_eigen = b_tensor;
|
||||||
|
|
||||||
|
// build operations nodes
|
||||||
|
ggml_build_forward_expand(gf, distance);
|
||||||
|
}
|
||||||
|
|
||||||
|
// delete the temporary context used to build the graph
|
||||||
|
ggml_free(ctx0);
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_status compute_piter(
|
||||||
|
const struct pca_params & params,
|
||||||
|
const pca_model & model,
|
||||||
|
struct ggml_cgraph * gf,
|
||||||
|
ggml_gallocr_t allocr,
|
||||||
|
struct pca_result & result) {
|
||||||
|
// allocate tensors
|
||||||
|
ggml_gallocr_alloc_graph(allocr, gf);
|
||||||
|
|
||||||
|
if (ggml_backend_is_cpu(model.backend)) {
|
||||||
|
ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: enable GPU support when support for GGML_OP_SQRT is added
|
||||||
|
//#ifdef GGML_USE_METAL
|
||||||
|
// if (ggml_backend_is_metal(model.backend)) {
|
||||||
|
// ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
|
||||||
|
// }
|
||||||
|
//#endif
|
||||||
|
|
||||||
|
ggml_status res = ggml_backend_graph_compute(model.backend, gf);
|
||||||
|
if (res == GGML_STATUS_SUCCESS) {
|
||||||
|
auto extract_i = [](std::string prefix, std::string str) -> int {
|
||||||
|
int i = -1;
|
||||||
|
if (str.rfind(prefix, 0) == 0) {
|
||||||
|
sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
};
|
||||||
|
result.calculated_square = NULL;
|
||||||
|
result.eigenvectors.clear();
|
||||||
|
result.distances.clear();
|
||||||
|
result.eigenvectors.resize(params.n_batch);
|
||||||
|
result.distances.resize(params.n_batch);
|
||||||
|
// get output nodes
|
||||||
|
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||||
|
auto node = gf->nodes[i];
|
||||||
|
int iter = -1;
|
||||||
|
// find b_tensor (without copying data from device)
|
||||||
|
if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
|
||||||
|
result.eigenvectors[iter] = node;
|
||||||
|
}
|
||||||
|
// find distances, then copy data from device
|
||||||
|
if ((iter = extract_i("distance_", node->name)) > -1) {
|
||||||
|
float d;
|
||||||
|
ggml_backend_tensor_get(node, &d, 0, sizeof(float));
|
||||||
|
result.distances[iter] = d;
|
||||||
|
// std::cout << node->name << " = " << d << "\n";
|
||||||
|
}
|
||||||
|
// find tmp_square if it exists (without copying data from device)
|
||||||
|
if (std::string(node->name) == "tmp_square") {
|
||||||
|
result.calculated_square = node;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void power_iteration(
|
||||||
|
const struct pca_params & params,
|
||||||
|
struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
|
||||||
|
struct ggml_tensor * output) {
|
||||||
|
//printf("in power iteration\n");
|
||||||
|
struct pca_model model(input);
|
||||||
|
|
||||||
|
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
|
||||||
|
struct pca_result result;
|
||||||
|
struct ggml_tensor * last_eigenvector = NULL;
|
||||||
|
|
||||||
|
int n_iters = params.n_iterations / params.n_batch; // the larger the batch, the fewer iterations
|
||||||
|
for (int iter = 0; iter < n_iters; ++iter) {
|
||||||
|
bool calc_square = (iter == 0); // only need to calculate square for first iteration
|
||||||
|
struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
|
||||||
|
// ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
|
||||||
|
compute_piter(params, model, gf, allocr, result);
|
||||||
|
|
||||||
|
for (size_t k = 0; k < result.distances.size(); ++k) {
|
||||||
|
last_eigenvector = result.eigenvectors[k];
|
||||||
|
if (result.distances[k] < params.tolerance) {
|
||||||
|
break; // done
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (calc_square) {
|
||||||
|
// copy and store the square matrix if needed
|
||||||
|
GGML_ASSERT(result.calculated_square != NULL);
|
||||||
|
ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// copy the last eigenvector and store it as input for the next iteration
|
||||||
|
GGML_ASSERT(last_eigenvector != NULL);
|
||||||
|
ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
|
||||||
|
__func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
|
||||||
|
}
|
||||||
|
|
||||||
|
// get output tensor
|
||||||
|
GGML_ASSERT(last_eigenvector);
|
||||||
|
ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
|
||||||
|
//print_debug_tensor(output);
|
||||||
|
ggml_gallocr_free(allocr);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void run_pca(
|
||||||
|
struct pca_params & params,
|
||||||
|
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
|
||||||
|
const std::vector<struct ggml_tensor *> & v_output) {
|
||||||
|
printf("%s: Running PCA...\n", __func__);
|
||||||
|
for (size_t il = 0; il < v_input.size(); ++il) {
|
||||||
|
|
||||||
|
// prepare output vector
|
||||||
|
struct ggml_tensor * ctrl_out = v_output[il];
|
||||||
|
ggml_format_name(ctrl_out, "direction.%zu", il+1);
|
||||||
|
|
||||||
|
// run power_iteration
|
||||||
|
params.i_layer = il;
|
||||||
|
params.n_layers = v_input.size();
|
||||||
|
power_iteration(params, v_input[il], ctrl_out);
|
||||||
|
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
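For reference, the ggml graph built by `build_graph_piter` and executed by `compute_piter` is a batched form of classic power iteration: repeatedly multiply the square matrix by the current vector, renormalize, measure how far the vector moved, and stop once that distance falls below the tolerance. A minimal standalone sketch of the same idea on a small dense matrix, illustrative only and not taken from the patch:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// power iteration on a small dense symmetric matrix A (n x n):
// repeatedly apply b <- A*b / ||A*b|| until b stops moving.
static std::vector<float> power_iteration(const std::vector<std::vector<float>> & A, int max_iters = 1000, float tol = 1e-7f) {
    const size_t n = A.size();
    std::vector<float> b(n, 1.0f / std::sqrt((float) n)); // normalized start vector
    for (int it = 0; it < max_iters; it++) {
        // nb = A * b
        std::vector<float> nb(n, 0.0f);
        for (size_t i = 0; i < n; i++) {
            for (size_t j = 0; j < n; j++) {
                nb[i] += A[i][j] * b[j];
            }
        }
        // normalize nb
        float norm = 0.0f;
        for (float v : nb) norm += v * v;
        norm = std::sqrt(norm);
        for (float & v : nb) v /= norm;
        // distance between the new and the old vector
        float dist = 0.0f;
        for (size_t i = 0; i < n; i++) {
            float d = nb[i] - b[i];
            dist += d * d;
        }
        b = nb;
        if (std::sqrt(dist) < tol) {
            break; // converged to the dominant eigenvector
        }
    }
    return b;
}

int main() {
    // dominant eigenvector of this matrix is approximately [0.851, 0.526]
    std::vector<std::vector<float>> A = {{3.0f, 1.0f}, {1.0f, 2.0f}};
    std::vector<float> v = power_iteration(A);
    printf("dominant eigenvector: [%f, %f]\n", v[0], v[1]);
    return 0;
}
```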
1
examples/cvector-generator/positive.txt
Normal file
1
examples/cvector-generator/positive.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
[INST] Act like a person who is extremely happy. [/INST]
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET embedding)
|
set(TARGET llama-embedding)
|
||||||
add_executable(${TARGET} embedding.cpp)
|
add_executable(${TARGET} embedding.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
|
||||||
### Unix-based systems (Linux, macOS, etc.):
|
### Unix-based systems (Linux, macOS, etc.):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
|
./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
|
||||||
```
|
```
|
||||||
|
|
||||||
### Windows:
|
### Windows:
|
||||||
|
|
||||||
```powershell
|
```powershell
|
||||||
embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
|
llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
|
||||||
```
|
```
|
||||||
|
|
||||||
The above command will output space-separated float values.
|
The above command will output space-separated float values.
|
||||||
|
|
|
@ -63,6 +63,7 @@ int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
gpt_params_print_usage(argc, argv, params);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -79,9 +80,6 @@ int main(int argc, char ** argv) {
|
||||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
std::mt19937 rng(params.seed);
|
||||||
if (params.random_prompt) {
|
|
||||||
params.prompt = string_random_prompt(rng);
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
set(TARGET eval-callback)
|
set(TARGET llama-eval-callback)
|
||||||
add_executable(${TARGET} eval-callback.cpp)
|
add_executable(${TARGET} eval-callback.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
|
||||||
set(TEST_TARGET test-eval-callback)
|
set(TEST_TARGET test-eval-callback)
|
||||||
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||||
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
||||||
|
|
|
@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data.
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
eval-callback \
|
llama-eval-callback \
|
||||||
--hf-repo ggml-org/models \
|
--hf-repo ggml-org/models \
|
||||||
--hf-file phi-2/ggml-model-q4_0.gguf \
|
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||||
--model phi-2-q4_0.gguf \
|
--model phi-2-q4_0.gguf \
|
||||||
|
|
|
@ -140,20 +140,18 @@ static bool run(llama_context * ctx, const gpt_params & params) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
|
|
||||||
callback_data cb_data;
|
callback_data cb_data;
|
||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
gpt_params_print_usage(argc, argv, params);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
print_build_info();
|
print_build_info();
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
std::mt19937 rng(params.seed);
|
||||||
if (params.random_prompt) {
|
|
||||||
params.prompt = string_random_prompt(rng);
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET export-lora)
|
set(TARGET llama-export-lora)
|
||||||
add_executable(${TARGET} export-lora.cpp)
|
add_executable(${TARGET} export-lora.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
Apply LORA adapters to base model and export the resulting model.
|
Apply LORA adapters to base model and export the resulting model.
|
||||||
|
|
||||||
```
|
```
|
||||||
usage: export-lora [options]
|
usage: llama-export-lora [options]
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
|
@ -17,7 +17,7 @@ options:
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./bin/export-lora \
|
./bin/llama-export-lora \
|
||||||
-m open-llama-3b-v2-q8_0.gguf \
|
-m open-llama-3b-v2-q8_0.gguf \
|
||||||
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
|
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
|
||||||
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
|
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET finetune)
|
set(TARGET llama-finetune)
|
||||||
add_executable(${TARGET} finetune.cpp)
|
add_executable(${TARGET} finetune.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -7,7 +7,7 @@ Basic usage instructions:
|
||||||
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
|
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
|
||||||
|
|
||||||
# finetune LORA adapter
|
# finetune LORA adapter
|
||||||
./bin/finetune \
|
./bin/llama-finetune \
|
||||||
--model-base open-llama-3b-v2-q8_0.gguf \
|
--model-base open-llama-3b-v2-q8_0.gguf \
|
||||||
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
|
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
|
||||||
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
|
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
|
||||||
|
@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
||||||
--use-checkpointing
|
--use-checkpointing
|
||||||
|
|
||||||
# predict
|
# predict
|
||||||
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||||
```
|
```
|
||||||
|
|
||||||
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
|
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
|
||||||
|
@ -38,14 +38,14 @@ After 10 more iterations:
|
||||||
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
|
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
|
||||||
|
|
||||||
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
|
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
|
||||||
These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.
|
These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
|
||||||
|
|
||||||
In `main` you can also load multiple LORA adapters, which will then be mixed together.
|
In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
|
||||||
|
|
||||||
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
|
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./bin/main -m open-llama-3b-v2-q8_0.gguf \
|
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||||
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
|
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
|
||||||
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
|
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
|
||||||
```
|
```
|
||||||
|
@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
|
||||||
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
|
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./bin/main -m open-llama-3b-v2-q8_0.gguf \
|
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||||
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
|
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
|
||||||
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
|
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
|
||||||
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
||||||
|
|
Some files were not shown because too many files have changed in this diff.