Changed "llama" to "jarvis"

Caleb P. Nwokocha 2024-10-26 11:10:29 -05:00
parent 4dfbcf9646
commit 52ab617954
372 changed files with 8788 additions and 8788 deletions
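The identical addition and deletion counts suggest a purely mechanical find-and-replace across the tree rather than functional changes. A minimal sketch of how such a rename could be scripted (an illustrative assumption, not the command actually used for this commit; it relies on git grep and GNU sed and only rewrites file contents, not file names):

    # Rewrite every case variant of the old name in all tracked text files:
    # llama -> jarvis, Llama -> Jarvis, LLAMA -> JARVIS
    git grep -lIi 'llama' | while read -r f; do
        sed -i -e 's/llama/jarvis/g' -e 's/Llama/Jarvis/g' -e 's/LLAMA/JARVIS/g' "$f"
    done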

View file

@@ -7,16 +7,16 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
 checkout scm // Clone the repo on Runner
 }
 }
-stage('Compiling llama.cpp'){
+stage('Compiling jarvis.cpp'){
 sh'''#!/bin/bash
-make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
+make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling jarvis for RISC-V
 '''
 }
-stage('Running llama.cpp'){
+stage('Running jarvis.cpp'){
 sh'''#!/bin/bash
 module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
-qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./jarvis-cli -m /home/alitariq/codejarvis-7b.Q4_K_M.gguf -p "Anything" -n 9 > jarvis_log.txt # Running jarvis.cpp on vector qemu-riscv64
-cat llama_log.txt # Printing results
+cat jarvis_log.txt # Printing results
 '''
 }
 }

View file

@@ -26,7 +26,7 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
 export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
 fi && \
-cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+cmake -B build -DGGML_CUDA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
 cmake --build build --config Release -j$(nproc) && \
 cp build/bin/* .

View file

@@ -19,7 +19,7 @@ WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+RUN cmake -B build -DGGML_MUSA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
 cmake --build build --config Release -j$(nproc) && \
 cp build/bin/* .

View file

@@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
 gfx803 \
@@ -41,7 +41,7 @@ ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 # Enable cURL
-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1
 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev

View file

@@ -15,7 +15,7 @@ WORKDIR /app
 COPY . .
-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1
 RUN make -j$(nproc)

View file

@@ -23,11 +23,11 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 RUN echo "Building with static libs" && \
 source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
 cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
-cmake --build build --config Release --target llama-cli
+cmake --build build --config Release --target jarvis-cli
 # TODO: use image with NNRT
 FROM cosdt/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli
 ENV LC_ALL=C.utf8
@@ -41,4 +41,4 @@ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
 ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
 ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
-ENTRYPOINT ["/llama-cli" ]
+ENTRYPOINT ["/jarvis-cli" ]

View file

@@ -23,7 +23,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
 export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
 fi && \
 cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-cli -j$(nproc)
+cmake --build build --config Release --target jarvis-cli -j$(nproc)
 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
@@ -31,7 +31,7 @@ RUN apt-get update && \
 apt-get install -y libgomp1
 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli
-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

View file

@@ -17,12 +17,12 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
 echo "Building with static libs" && \
 cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
 ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
-cmake --build build --config Release --target llama-cli
+cmake --build build --config Release --target jarvis-cli
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli
 ENV LC_ALL=C.utf8
-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

View file

@@ -16,7 +16,7 @@ WORKDIR /app
 COPY . .
 RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-cli -j$(nproc)
+cmake --build build --config Release --target jarvis-cli -j$(nproc)
 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
@@ -24,7 +24,7 @@ RUN apt-get update && \
 apt-get install -y libgomp1
 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli
-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

View file

@@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
 gfx803 \
@@ -40,6 +40,6 @@ ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
-RUN make -j$(nproc) llama-cli
+RUN make -j$(nproc) jarvis-cli
-ENTRYPOINT [ "/app/llama-cli" ]
+ENTRYPOINT [ "/app/jarvis-cli" ]

View file

@@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DGGML_VULKAN=1 && \
-cmake --build build --config Release --target llama-cli
+cmake --build build --config Release --target jarvis-cli
 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/llama-cli /llama-cli && \
+RUN cp /app/build/bin/jarvis-cli /jarvis-cli && \
 rm -rf /app
 ENV LC_ALL=C.utf8
-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

View file

@@ -9,15 +9,15 @@ WORKDIR /app
 COPY . .
-RUN make -j$(nproc) llama-cli
+RUN make -j$(nproc) jarvis-cli
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 RUN apt-get update && \
 apt-get install -y libgomp1
-COPY --from=build /app/llama-cli /llama-cli
+COPY --from=build /app/jarvis-cli /jarvis-cli
 ENV LC_ALL=C.utf8
-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

View file

@@ -3,7 +3,7 @@
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-# Notes for llama.cpp:
+# Notes for jarvis.cpp:
 # 1. Tags are currently based on hash - which will not sort asciibetically.
 # We need to declare standard versioning if people want to sort latest releases.
 # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
@@ -12,44 +12,44 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 # It is up to the user to install the correct vendor-specific support.
-Name: llama.cpp-cuda
+Name: jarvis.cpp-cuda
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License: MIT
-Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0: https://github.com/ggerganov/jarvis.cpp/archive/refs/heads/master.tar.gz
 BuildRequires: coreutils make gcc-c++ git cuda-toolkit
 Requires: cuda-toolkit
-URL: https://github.com/ggerganov/llama.cpp
+URL: https://github.com/ggerganov/jarvis.cpp
 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0
 %description
-CPU inference for Meta's Lllama2 models using default options.
+CPU inference for Meta's Ljarvis2 models using default options.
 %prep
-%setup -n llama.cpp-master
+%setup -n jarvis.cpp-master
 %build
 make -j GGML_CUDA=1
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p jarvis-cli %{buildroot}%{_bindir}/jarvis-cuda-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
+cp -p jarvis-server %{buildroot}%{_bindir}/jarvis-cuda-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
+cp -p jarvis-simple %{buildroot}%{_bindir}/jarvis-cuda-simple
 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/jarviscuda.service
 [Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
+Description=Jarvis.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
 [Service]
 Type=simple
-EnvironmentFile=/etc/sysconfig/llama
+EnvironmentFile=/etc/sysconfig/jarvis
-ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
+ExecStart=/usr/bin/jarvis-cuda-server $JARVIS_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
@@ -58,8 +58,8 @@ WantedBy=default.target
 EOF
 mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/jarvis
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+JARVIS_ARGS="-m /opt/jarvis2/ggml-model-f32.bin"
 EOF
 %clean
@@ -67,11 +67,11 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 %files
-%{_bindir}/llama-cuda-cli
+%{_bindir}/jarvis-cuda-cli
-%{_bindir}/llama-cuda-server
+%{_bindir}/jarvis-cuda-server
-%{_bindir}/llama-cuda-simple
+%{_bindir}/jarvis-cuda-simple
-/usr/lib/systemd/system/llamacuda.service
+/usr/lib/systemd/system/jarviscuda.service
-%config /etc/sysconfig/llama
+%config /etc/sysconfig/jarvis
 %pre

View file

@@ -3,7 +3,7 @@
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-# Notes for llama.cpp:
+# Notes for jarvis.cpp:
 # 1. Tags are currently based on hash - which will not sort asciibetically.
 # We need to declare standard versioning if people want to sort latest releases.
 # In the meantime, YYYYMMDD format will be used.
@@ -13,45 +13,45 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 # It is up to the user to install the correct vendor-specific support.
-Name: llama.cpp
+Name: jarvis.cpp
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License: MIT
-Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0: https://github.com/ggerganov/jarvis.cpp/archive/refs/heads/master.tar.gz
 BuildRequires: coreutils make gcc-c++ git libstdc++-devel
 Requires: libstdc++
-URL: https://github.com/ggerganov/llama.cpp
+URL: https://github.com/ggerganov/jarvis.cpp
 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0
 %description
-CPU inference for Meta's Lllama2 models using default options.
+CPU inference for Meta's Ljarvis2 models using default options.
 Models are not included in this package and must be downloaded separately.
 %prep
-%setup -n llama.cpp-master
+%setup -n jarvis.cpp-master
 %build
 make -j
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p jarvis-cli %{buildroot}%{_bindir}/jarvis-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-server
+cp -p jarvis-server %{buildroot}%{_bindir}/jarvis-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
+cp -p jarvis-simple %{buildroot}%{_bindir}/jarvis-simple
 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/jarvis.service
 [Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
+Description=Jarvis.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
 [Service]
 Type=simple
-EnvironmentFile=/etc/sysconfig/llama
+EnvironmentFile=/etc/sysconfig/jarvis
-ExecStart=/usr/bin/llama-server $LLAMA_ARGS
+ExecStart=/usr/bin/jarvis-server $JARVIS_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
@@ -60,8 +60,8 @@ WantedBy=default.target
 EOF
 mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/jarvis
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+JARVIS_ARGS="-m /opt/jarvis2/ggml-model-f32.bin"
 EOF
 %clean
@@ -69,11 +69,11 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 %files
-%{_bindir}/llama-cli
+%{_bindir}/jarvis-cli
-%{_bindir}/llama-server
+%{_bindir}/jarvis-server
-%{_bindir}/llama-simple
+%{_bindir}/jarvis-simple
-/usr/lib/systemd/system/llama.service
+/usr/lib/systemd/system/jarvis.service
-%config /etc/sysconfig/llama
+%config /etc/sysconfig/jarvis
 %pre

View file

@@ -22,8 +22,8 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
 export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
 fi && \
-cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+cmake -B build -DGGML_CUDA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-server -j$(nproc)
+cmake --build build --config Release --target jarvis-server -j$(nproc)
 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
@@ -31,12 +31,12 @@ RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1 curl
 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
-COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/build/bin/jarvis-server /jarvis-server
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

View file

@@ -15,20 +15,20 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
 export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
 fi && \
 echo "Building with dynamic libs" && \
-cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DJARVIS_CURL=ON ${OPT_SYCL_F16} && \
-cmake --build build --config Release --target llama-server
+cmake --build build --config Release --target jarvis-server
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev curl
-COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/build/bin/jarvis-server /jarvis-server
 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

View file

@@ -15,8 +15,8 @@ WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+RUN cmake -B build -DGGML_MUSA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-server -j$(nproc)
+cmake --build build --config Release --target jarvis-server -j$(nproc)
 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
@@ -24,12 +24,12 @@ RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1 curl
 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
-COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/build/bin/jarvis-server /jarvis-server
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

View file

@@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
 gfx803 \
@@ -40,15 +40,15 @@ ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0
 # Enable cURL
-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1
 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev curl
-RUN make -j$(nproc) llama-server
+RUN make -j$(nproc) jarvis-server
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-ENTRYPOINT [ "/app/llama-server" ]
+ENTRYPOINT [ "/app/jarvis-server" ]

View file

@@ -14,18 +14,18 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+RUN cmake -B build -DGGML_VULKAN=1 -DJARVIS_CURL=1 && \
-cmake --build build --config Release --target llama-server
+cmake --build build --config Release --target jarvis-server
 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/llama-server /llama-server && \
+RUN cp /app/build/bin/jarvis-server /jarvis-server && \
 rm -rf /app
 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

View file

@@ -9,21 +9,21 @@ WORKDIR /app
 COPY . .
-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1
-RUN make -j$(nproc) llama-server
+RUN make -j$(nproc) jarvis-server
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1 curl
-COPY --from=build /app/llama-server /llama-server
+COPY --from=build /app/jarvis-server /jarvis-server
 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

View file

@@ -6,10 +6,10 @@
 let
 inherit (config.packages) default;
 binaries = [
-"llama-cli"
+"jarvis-cli"
-"llama-embedding"
+"jarvis-embedding"
-"llama-server"
+"jarvis-server"
-"llama-quantize"
+"jarvis-quantize"
 ];
 mkApp = name: {
 type = "app";

View file

@@ -2,14 +2,14 @@
 lib,
 dockerTools,
 buildEnv,
-llama-cpp,
+jarvis-cpp,
 interactive ? true,
 coreutils,
 }:
 # A tar that can be fed into `docker load`:
 #
-# $ nix build .#llamaPackages.docker
+# $ nix build .#jarvisPackages.docker
 # $ docker load < result
 # For details and variations cf.
@@ -19,16 +19,16 @@
 # Approximate (compressed) sizes, at the time of writing, are:
 #
-# .#llamaPackages.docker: 125M;
+# .#jarvisPackages.docker: 125M;
-# .#llamaPackagesCuda.docker: 537M;
+# .#jarvisPackagesCuda.docker: 537M;
-# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
+# .#legacyPackages.aarch64-linux.jarvisPackagesXavier.docker: 415M.
 dockerTools.buildLayeredImage {
-name = llama-cpp.pname;
+name = jarvis-cpp.pname;
 tag = "latest";
 contents =
-[ llama-cpp ]
+[ jarvis-cpp ]
 ++ lib.optionals interactive [
 coreutils
 dockerTools.binSh

View file

@@ -11,10 +11,10 @@
 {
 legacyPackages =
 let
-caps.llamaPackagesXavier = "7.2";
+caps.jarvisPackagesXavier = "7.2";
-caps.llamaPackagesOrin = "8.7";
+caps.jarvisPackagesOrin = "8.7";
-caps.llamaPackagesTX2 = "6.2";
+caps.jarvisPackagesTX2 = "6.2";
-caps.llamaPackagesNano = "5.3";
+caps.jarvisPackagesNano = "5.3";
 pkgsFor =
 cap:
@@ -31,9 +31,9 @@
 builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
 packages = lib.optionalAttrs (system == "aarch64-linux") {
-jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
+jetson-xavier = config.legacyPackages.jarvisPackagesXavier.jarvis-cpp;
-jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
+jetson-orin = config.legacyPackages.jarvisPackagesOrin.jarvis-cpp;
-jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
+jetson-nano = config.legacyPackages.jarvisPackagesNano.jarvis-cpp;
 };
 };
 }

View file

@@ -1,6 +1,6 @@
 {
 lib,
-llamaVersion,
+jarvisVersion,
 numpy,
 tqdm,
 sentencepiece,
@@ -12,7 +12,7 @@
 buildPythonPackage {
 pname = "gguf";
-version = llamaVersion;
+version = jarvisVersion;
 pyproject = true;
 nativeBuildInputs = [ poetry-core ];
 propagatedBuildInputs = [

View file

@@ -33,7 +33,7 @@
 useRocm ? config.rocmSupport,
 enableCurl ? true,
 useVulkan ? false,
-llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
+jarvisVersion ? "0.0.0", # Arbitrary version, substituted by the flake
 # It's necessary to consistently use backendStdenv when building with CUDA support,
 # otherwise we get libstdc++ errors downstream.
@@ -103,8 +103,8 @@ let
 in
 effectiveStdenv.mkDerivation (finalAttrs: {
-pname = "llama-cpp${pnameSuffix}";
+pname = "jarvis-cpp${pnameSuffix}";
-version = llamaVersion;
+version = jarvisVersion;
 # Note: none of the files discarded here are visible in the sandbox or
 # affect the output hash. This also means they can be modified without
@@ -132,12 +132,12 @@ effectiveStdenv.mkDerivation (finalAttrs: {
 --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
 '';
-# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+# With PR#6015 https://github.com/ggerganov/jarvis.cpp/pull/6015,
 # `default.metallib` may be compiled with Metal compiler from XCode
 # and we need to escape sandbox on MacOS to access Metal compiler.
 # `xcrun` is used find the path of the Metal compiler, which is varible
 # and not on $PATH
-# see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+# see https://github.com/ggerganov/jarvis.cpp/pull/6118 for discussion
 __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
 nativeBuildInputs =
@@ -166,10 +166,10 @@ effectiveStdenv.mkDerivation (finalAttrs: {
 cmakeFlags =
 [
-(cmakeBool "LLAMA_BUILD_SERVER" true)
+(cmakeBool "JARVIS_BUILD_SERVER" true)
 (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
 (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-(cmakeBool "LLAMA_CURL" enableCurl)
+(cmakeBool "JARVIS_CURL" enableCurl)
 (cmakeBool "GGML_NATIVE" false)
 (cmakeBool "GGML_BLAS" useBlas)
 (cmakeBool "GGML_CUDA" useCuda)
@@ -205,7 +205,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
 # if they haven't been added yet.
 postInstall = ''
 mkdir -p $out/include
-cp $src/include/llama.h $out/include/
+cp $src/include/jarvis.h $out/include/
 '';
 meta = {
@@ -219,11 +219,11 @@ effectiveStdenv.mkDerivation (finalAttrs: {
 broken = (useMetalKit && !effectiveStdenv.isDarwin);
 description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-homepage = "https://github.com/ggerganov/llama.cpp/";
+homepage = "https://github.com/ggerganov/jarvis.cpp/";
 license = lib.licenses.mit;
 # Accommodates `nix run` and `lib.getExe`
-mainProgram = "llama-cli";
+mainProgram = "jarvis-cli";
 # These people might respond, on the best effort basis, if you ping them
 # in case of Nix-specific regressions or for reviewing Nix-specific PRs.

View file

@@ -9,7 +9,7 @@
 }@inputs:
 let
-llama-python-deps = with python3Packages; [
+jarvis-python-deps = with python3Packages; [
 numpy
 sentencepiece
 transformers
@@ -18,7 +18,7 @@ let
 gguf-py
 tqdm
-# for scripts/compare-llama-bench.py
+# for scripts/compare-jarvis-bench.py
 gitpython
 tabulate
@@ -28,7 +28,7 @@ let
 ];
-llama-python-test-deps = with python3Packages; [
+jarvis-python-test-deps = with python3Packages; [
 # Server bench
 matplotlib
@@ -40,7 +40,7 @@ let
 in
 buildPythonPackage ({
-pname = "llama-scripts";
+pname = "jarvis-scripts";
 version = "0.0.0";
 pyproject = true;
@@ -61,6 +61,6 @@ buildPythonPackage ({
 src = lib.cleanSource ../../.;
 };
 nativeBuildInputs = [ poetry-core ];
-nativeCheckInputs = llama-python-test-deps;
+nativeCheckInputs = jarvis-python-test-deps;
-dependencies = llama-python-deps;
+dependencies = jarvis-python-deps;
 })

View file

@@ -2,7 +2,7 @@
 lib,
 newScope,
 python3,
-llamaVersion ? "0.0.0",
+jarvisVersion ? "0.0.0",
 }:
 let
@@ -21,7 +21,7 @@ in
 # Cf. https://noogle.dev/f/lib/makeScope
 lib.makeScope newScope (self: {
-inherit llamaVersion;
+inherit jarvisVersion;
 gguf-py = self.callPackage ./package-gguf-py.nix {
 inherit
 buildPythonPackage
@@ -34,7 +34,7 @@ lib.makeScope newScope (self: {
 ;
 };
 python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
-llama-cpp = self.callPackage ./package.nix { };
+jarvis-cpp = self.callPackage ./package.nix { };
 docker = self.callPackage ./docker.nix { };
 docker-min = self.callPackage ./docker.nix { interactive = false; };
 sif = self.callPackage ./sif.nix { };

View file

@@ -1,7 +1,7 @@
 {
 lib,
 singularity-tools,
-llama-cpp,
+jarvis-cpp,
 bashInteractive,
 interactive ? false,
 }:
@@ -10,8 +10,8 @@ let
 optionalInt = cond: x: if cond then x else 0;
 in
 singularity-tools.buildImage rec {
-inherit (llama-cpp) name;
+inherit (jarvis-cpp) name;
-contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
+contents = [ jarvis-cpp ] ++ lib.optionals interactive [ bashInteractive ];
 # These are excessive (but safe) for most variants. Building singularity
 # images requires superuser privileges, so we build them inside a VM in a
@@ -22,6 +22,6 @@ singularity-tools.buildImage rec {
 # Expected image sizes:
 # - cpu/blas: 150M,
 # - cuda, all gencodes: 560M,
-diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
+diskSize = 4096 + optionalInt jarvis-cpp.useRocm 16384;
 memSize = diskSize;
 }

View file

@@ -10,9 +10,9 @@ shift
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
 python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-./llama-quantize "$@"
+./jarvis-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-./llama-cli "$@"
+./jarvis-cli "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
 echo "Converting PTH to GGML..."
 for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -20,17 +20,17 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
 echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
 else
 echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+./jarvis-quantize "$i" "${i/f16/q4_0}" q4_0
 fi
 done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-./llama-server "$@"
+./jarvis-server "$@"
 else
 echo "Unknown command: $arg1"
 echo "Available commands: "
 echo " --run (-r): Run a model previously converted into ggml"
 echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
-echo " --convert (-c): Convert a llama model into ggml"
+echo " --convert (-c): Convert a jarvis model into ggml"
 echo " ex: --outtype f16 \"/models/7B/\" "
 echo " --quantize (-q): Optimize with quantization process ggml"
 echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"

View file

@@ -12,8 +12,8 @@ build*/
 models/*
-/llama-cli
+/jarvis-cli
-/llama-quantize
+/jarvis-quantize
 arm_neon.h
 compile_commands.json

View file

@@ -24,7 +24,7 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2
-[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
+[examples/jarvis.swiftui/jarvis.swiftui.xcodeproj/*]
 indent_style = tab
 [examples/cvector-generator/*.txt]

View file

@@ -1,5 +1,5 @@
 name: Low Severity Bugs
-description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
+description: Used to report low severity bugs in jarvis.cpp (e.g. cosmetic issues, non critical UI glitches)
 title: "Bug: "
 labels: ["bug-unconfirmed", "low severity"]
 body:
@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

View file

@@ -1,5 +1,5 @@
 name: Medium Severity Bug
-description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
+description: Used to report medium severity bugs in jarvis.cpp (e.g. Malfunctioning Features but generally still useable)
 title: "Bug: "
 labels: ["bug-unconfirmed", "medium severity"]
 body:
@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

View file

@@ -1,5 +1,5 @@
 name: High Severity Bug
-description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
+description: Used to report high severity bugs in jarvis.cpp (e.g. Malfunctioning features hindering important common workflow)
 title: "Bug: "
 labels: ["bug-unconfirmed", "high severity"]
 body:
@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

View file

@@ -1,5 +1,5 @@
 name: Critical Severity Bug
-description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
+description: Used to report critical severity bugs in jarvis.cpp (e.g. Crashing, Corrupted, Dataloss)
 title: "Bug: "
 labels: ["bug-unconfirmed", "critical severity"]
 body:
@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

View file

@@ -1,12 +1,12 @@
 name: Enhancement
-description: Used to request enhancements for llama.cpp
+description: Used to request enhancements for jarvis.cpp
 title: "Feature Request: "
 labels: ["enhancement"]
 body:
 - type: markdown
 attributes:
 value: |
-[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
+[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/jarvis.cpp/discussions/categories/ideas)
 - type: checkboxes
 id: prerequisites
@@ -16,18 +16,18 @@ body:
 options:
 - label: I am running the latest code. Mention the version if possible as well.
 required: true
-- label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- label: I carefully followed the [README.md](https://github.com/ggerganov/jarvis.cpp/blob/master/README.md).
 required: true
 - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
 required: true
-- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
+- label: I reviewed the [Discussions](https://github.com/ggerganov/jarvis.cpp/discussions), and have a new and useful enhancement to share.
 required: true
 - type: textarea
 id: feature-description
 attributes:
 label: Feature Description
-description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+description: Please provide a detailed written description of what you were trying to do, and what you expected `jarvis.cpp` to do as an enhancement.
 placeholder: Detailed description of the enhancement
 validations:
 required: true
@@ -36,7 +36,7 @@ body:
 id: motivation
 attributes:
 label: Motivation
-description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `jarvis.cpp` users.
 placeholder: Explanation of why this feature is needed and its benefits
 validations:
 required: true

View file

@@ -6,7 +6,7 @@ body:
 - type: markdown
 attributes:
 value: |
-Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
 - type: checkboxes
 id: research-stage

View file

@@ -6,8 +6,8 @@ body:
 - type: markdown
 attributes:
 value: |
-Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
+Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/jarvis.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
 - type: textarea
 id: background-description

View file

@@ -1,11 +1,11 @@
 blank_issues_enabled: true
 contact_links:
 - name: Got an idea?
-url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
+url: https://github.com/ggerganov/jarvis.cpp/discussions/categories/ideas
 about: Pop it there. It may then become an enhancement ticket.
 - name: Got a question?
-url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
+url: https://github.com/ggerganov/jarvis.cpp/discussions/categories/q-a
 about: Ask a question there!
 - name: Want to contribute?
-url: https://github.com/ggerganov/llama.cpp/wiki/contribute
+url: https://github.com/ggerganov/jarvis.cpp/wiki/contribute
 about: Head to the contribution guide page of the wiki for areas you can help with

.github/labeler.yml
View file

@@ -67,7 +67,7 @@ script:
 android:
 - changed-files:
 - any-glob-to-any-file:
-- examples/llama.android/**
+- examples/jarvis.android/**
 server:
 - changed-files:
 - any-glob-to-any-file:

View file

@@ -1,6 +1,6 @@
-- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+- [x] I have read the [contributing guidelines](https://github.com/ggerganov/jarvis.cpp/blob/master/CONTRIBUTING.md)
 - Self-reported review complexity:
 - [ ] Low
 - [ ] Medium

View file

@@ -1,5 +1,5 @@
 # TODO: there have been some issues with the workflow, so disabling for now
-# https://github.com/ggerganov/llama.cpp/issues/7893
+# https://github.com/ggerganov/jarvis.cpp/issues/7893
 #
 # Benchmark
 name: Benchmark
@@ -27,10 +27,10 @@ on:
 push:
 branches:
 - master
-paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+paths: ['jarvis.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
 pull_request_target:
 types: [opened, synchronize, reopened]
-paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+paths: ['jarvis.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
 schedule:
 - cron: '04 2 * * *'
@@ -113,16 +113,16 @@ jobs:
 set -eux
 cmake -B build \
 -DGGML_NATIVE=OFF \
--DLLAMA_BUILD_SERVER=ON \
+-DJARVIS_BUILD_SERVER=ON \
--DLLAMA_CURL=ON \
+-DJARVIS_CURL=ON \
--DLLAMA_CUBLAS=ON \
+-DJARVIS_CUBLAS=ON \
 -DCUDAToolkit_ROOT=/usr/local/cuda \
 -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
 -DCMAKE_CUDA_ARCHITECTURES=75 \
--DLLAMA_FATAL_WARNINGS=OFF \
+-DJARVIS_FATAL_WARNINGS=OFF \
--DLLAMA_ALL_WARNINGS=OFF \
+-DJARVIS_ALL_WARNINGS=OFF \
 -DCMAKE_BUILD_TYPE=Release;
-cmake --build build --config Release -j $(nproc) --target llama-server
+cmake --build build --config Release -j $(nproc) --target jarvis-server
 - name: Download the dataset
 id: download_dataset
@@ -240,7 +240,7 @@ jobs:
 message: |
 <p align="center">
-📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+📈 **jarvis.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
 </p>
@@ -249,9 +249,9 @@ jobs:
 <summary>Expand details for performance related PR only</summary>
 - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.JARVISCPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.JARVISCPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+- Prompt processing (pp): avg=${{ env.JARVISCPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.JARVISCPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
-- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
+- Token generation (tg): avg=${{ env.JARVISCPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.JARVISCPP_TOKENS_SECOND_P_95_ }}tk/s
 - ${{ env.BENCH_GRAPH_XLABEL }}
View file
@ -28,9 +28,9 @@ env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }} BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3 GGML_NLOOP: 3
GGML_N_THREADS: 1 GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1 JARVIS_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1 JARVIS_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1 JARVIS_LOG_TIMESTAMPS: 1
jobs: jobs:
macOS-latest-cmake-arm64: macOS-latest-cmake-arm64:
@ -55,7 +55,7 @@ jobs:
sysctl -a sysctl -a
mkdir build mkdir build
cd build cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF .. cmake -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test - name: Test
@ -82,14 +82,14 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: | run: |
cp LICENSE ./build/bin/ cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* zip -r jarvis-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
- name: Upload artifacts - name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip path: jarvis-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
name: llama-bin-macos-arm64.zip name: jarvis-bin-macos-arm64.zip
macOS-latest-cmake-x64: macOS-latest-cmake-x64:
runs-on: macos-12 runs-on: macos-12
@ -112,8 +112,8 @@ jobs:
run: | run: |
sysctl -a sysctl -a
# Metal is disabled due to intermittent failures with Github runners not having a GPU: # Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 # https://github.com/ggerganov/jarvis.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF cmake -B build -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test - name: Test
@ -140,20 +140,20 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: | run: |
cp LICENSE ./build/bin/ cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* zip -r jarvis-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
- name: Upload artifacts - name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip path: jarvis-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip name: jarvis-bin-macos-x64.zip
ubuntu-focal-make: ubuntu-focal-make:
runs-on: ubuntu-20.04 runs-on: ubuntu-20.04
env: env:
LLAMA_NODE_AVAILABLE: true JARVIS_NODE_AVAILABLE: true
LLAMA_PYTHON_AVAILABLE: true JARVIS_PYTHON_AVAILABLE: true
steps: steps:
- name: Clone - name: Clone
@ -177,7 +177,7 @@ jobs:
- name: Build - name: Build
id: make_build id: make_build
env: env:
LLAMA_FATAL_WARNINGS: 1 JARVIS_FATAL_WARNINGS: 1
run: | run: |
CC=gcc-8 make -j $(nproc) CC=gcc-8 make -j $(nproc)
@ -204,8 +204,8 @@ jobs:
- name: Build - name: Build
id: make_build id: make_build
env: env:
LLAMA_FATAL_WARNINGS: 1 JARVIS_FATAL_WARNINGS: 1
LLAMA_CURL: 1 JARVIS_CURL: 1
run: | run: |
CC=gcc-8 make -j $(nproc) CC=gcc-8 make -j $(nproc)
@ -230,7 +230,7 @@ jobs:
run: | run: |
mkdir build mkdir build
cd build cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
cmake --build . --config Release -j $(nproc) cmake --build . --config Release -j $(nproc)
- name: Test - name: Test
@ -239,16 +239,16 @@ jobs:
cd build cd build
ctest -L 'main|curl' --verbose --timeout 900 ctest -L 'main|curl' --verbose --timeout 900
- name: Test llama2c conversion - name: Test jarvis2c conversion
id: llama2c_test id: jarvis2c_test
run: | run: |
cd build cd build
echo "Fetch tokenizer" echo "Fetch tokenizer"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin wget https://huggingface.co/karpathy/tinyjarviss/resolve/main/stories260K/tok512.bin
echo "Fetch llama2c model" echo "Fetch jarvis2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin wget https://huggingface.co/karpathy/tinyjarviss/resolve/main/stories260K/stories260K.bin
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf ./bin/jarvis-convert-jarvis2c-to-ggml --copy-vocab-from-model ./tok512.bin --jarvis2c-model stories260K.bin --jarvis2c-output-model stories260K.gguf
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 ./bin/jarvis-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Determine tag name - name: Determine tag name
id: tag id: tag
@ -268,14 +268,14 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: | run: |
cp LICENSE ./build/bin/ cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* zip -r jarvis-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
- name: Upload artifacts - name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip path: jarvis-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
name: llama-bin-ubuntu-x64.zip name: jarvis-bin-ubuntu-x64.zip
ubuntu-latest-cmake-sanitizer: ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -304,7 +304,7 @@ jobs:
run: | run: |
mkdir build mkdir build
cd build cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Build (no OpenMP) - name: Build (no OpenMP)
@ -313,7 +313,7 @@ jobs:
run: | run: |
mkdir build mkdir build
cd build cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Test - name: Test
@ -487,7 +487,7 @@ jobs:
# TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it. # how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 # ref: https://github.com/ggerganov/jarvis.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
macOS-latest-make: macOS-latest-make:
runs-on: macos-latest runs-on: macos-latest
@ -505,7 +505,7 @@ jobs:
- name: Build - name: Build
id: make_build id: make_build
env: env:
LLAMA_FATAL_WARNINGS: 1 JARVIS_FATAL_WARNINGS: 1
run: | run: |
GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu) GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
@ -517,7 +517,7 @@ jobs:
# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it. # how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584 # ref: https://github.com/ggerganov/jarvis.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
# would be great if we fix these # would be great if we fix these
macOS-latest-cmake: macOS-latest-cmake:
runs-on: macos-latest runs-on: macos-latest
@ -539,7 +539,7 @@ jobs:
sysctl -a sysctl -a
mkdir build mkdir build
cd build cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF .. cmake -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test - name: Test
@ -570,9 +570,9 @@ jobs:
cd build cd build
cmake -G Xcode .. \ cmake -G Xcode .. \
-DGGML_METAL_EMBED_LIBRARY=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \ -DJARVIS_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \ -DJARVIS_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \ -DJARVIS_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \ -DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
@ -600,9 +600,9 @@ jobs:
cd build cd build
cmake -G Xcode .. \ cmake -G Xcode .. \
-DGGML_METAL_EMBED_LIBRARY=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \ -DJARVIS_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \ -DJARVIS_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \ -DJARVIS_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=tvOS \ -DCMAKE_SYSTEM_NAME=tvOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
@ -629,7 +629,7 @@ jobs:
- name: xcodebuild for swift package - name: xcodebuild for swift package
id: xcodebuild id: xcodebuild
run: | run: |
xcodebuild -scheme llama -destination "${{ matrix.destination }}" xcodebuild -scheme jarvis -destination "${{ matrix.destination }}"
- name: Build Swift Example - name: Build Swift Example
id: make_build_swift_example id: make_build_swift_example
@ -705,23 +705,23 @@ jobs:
matrix: matrix:
include: include:
- build: 'noavx-x64' - build: 'noavx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2-x64' - build: 'avx2-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
- build: 'avx-x64' - build: 'avx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512-x64' - build: 'avx512-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'openblas-x64' - build: 'openblas-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64' - build: 'kompute-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
- build: 'vulkan-x64' - build: 'vulkan-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
- build: 'llvm-arm64' - build: 'llvm-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'msvc-arm64' - build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
steps: steps:
- name: Clone - name: Clone
@ -807,7 +807,7 @@ jobs:
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
cd build cd build
$env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 $env:JARVIS_SKIP_TESTS_SLOW_ON_EMULATOR = 1
& $sde -future -- ctest -L main -C Release --verbose --timeout 900 & $sde -future -- ctest -L main -C Release --verbose --timeout 900
- name: Determine tag name - name: Determine tag name
@ -827,15 +827,15 @@ jobs:
id: pack_artifacts id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: | run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt Copy-Item LICENSE .\build\bin\Release\jarvis.cpp.txt
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* 7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts - name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip path: jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
name: llama-bin-win-${{ matrix.build }}.zip name: jarvis-bin-win-${{ matrix.build }}.zip
windows-latest-cmake-cuda: windows-latest-cmake-cuda:
runs-on: windows-2019 runs-on: windows-2019
@ -865,7 +865,7 @@ jobs:
run: | run: |
mkdir build mkdir build
cd build cd build
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON cmake .. -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
@ -886,28 +886,28 @@ jobs:
id: pack_artifacts id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: | run: |
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\* 7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts - name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip path: jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip name: jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime - name: Copy and pack Cuda runtime
run: | run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}" echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
$dst='.\build\bin\cudart\' $dst='.\build\bin\cudart\'
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* 7z a cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
- name: Upload Cuda runtime - name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip path: cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip
name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip name: cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip
windows-latest-cmake-sycl: windows-latest-cmake-sycl:
runs-on: windows-latest runs-on: windows-latest
@ -963,14 +963,14 @@ jobs:
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done" echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* 7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload artifacts - name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip path: jarvis-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip name: jarvis-bin-win-sycl-x64.zip
windows-latest-cmake-hip: windows-latest-cmake-hip:
if: ${{ github.event.inputs.create_release != 'true' }} if: ${{ github.event.inputs.create_release != 'true' }}
@ -1060,13 +1060,13 @@ jobs:
- name: Pack artifacts - name: Pack artifacts
id: pack_artifacts id: pack_artifacts
run: | run: |
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* 7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
- name: Upload artifacts - name: Upload artifacts
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip path: jarvis-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip name: jarvis-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
ios-xcode-build: ios-xcode-build:
runs-on: macos-latest runs-on: macos-latest
@ -1076,7 +1076,7 @@ jobs:
uses: actions/checkout@v4 uses: actions/checkout@v4
- name: Build Xcode project - name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build run: xcodebuild -project examples/jarvis.swiftui/jarvis.swiftui.xcodeproj -scheme jarvis.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
android-build: android-build:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -1098,7 +1098,7 @@ jobs:
- name: Build - name: Build
run: | run: |
cd examples/llama.android cd examples/jarvis.android
./gradlew build --no-daemon ./gradlew build --no-daemon
@ -1261,7 +1261,7 @@ jobs:
# sudo apt-get install cmake # sudo apt-get install cmake
# #
# - name: Configure # - name: Configure
# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON # run: cmake . -DCMAKE_BUILD_TYPE=Debug -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON
# #
# - name: Build # - name: Build
# run: | # run: |
@ -1300,7 +1300,7 @@ jobs:
# - name: Upload binaries # - name: Upload binaries
# uses: actions/upload-artifact@v4 # uses: actions/upload-artifact@v4
# with: # with:
# name: llama-bin-${{ matrix.arch }} # name: jarvis-bin-${{ matrix.arch }}
# path: build/bin/${{ matrix.build }} # path: build/bin/${{ matrix.build }}
# #
# windows-blas: # windows-blas:
@ -1339,7 +1339,7 @@ jobs:
# run: > # run: >
# cmake -S . -B ./build -A ${{ matrix.arch }} # cmake -S . -B ./build -A ${{ matrix.arch }}
# -DCMAKE_BUILD_TYPE=${{ matrix.build }} # -DCMAKE_BUILD_TYPE=${{ matrix.build }}
# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }} # -DJARVIS_SUPPORT_OPENBLAS=${{ matrix.blas }}
# -DCMAKE_LIBRARY_PATH="$env:blasdir/lib" # -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
# #
# - name: Build # - name: Build
@ -1355,7 +1355,7 @@ jobs:
# if: matrix.blas == 'ON' # if: matrix.blas == 'ON'
# uses: actions/upload-artifact@v4 # uses: actions/upload-artifact@v4
# with: # with:
# name: llama-blas-bin-${{ matrix.arch }} # name: jarvis-blas-bin-${{ matrix.arch }}
# path: build/bin/${{ matrix.build }} # path: build/bin/${{ matrix.build }}
# #
# emscripten: # emscripten:
View file
@ -37,21 +37,21 @@ jobs:
strategy: strategy:
matrix: matrix:
config: config:
- { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "light", dockerfile: ".devops/jarvis-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "server", dockerfile: ".devops/jarvis-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "light-cuda", dockerfile: ".devops/jarvis-cli-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "server-cuda", dockerfile: ".devops/jarvis-server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" } - { tag: "light-musa", dockerfile: ".devops/jarvis-cli-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" } - { tag: "server-musa", dockerfile: ".devops/jarvis-server-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" } - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } #- { tag: "light-rocm", dockerfile: ".devops/jarvis-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } #- { tag: "server-rocm", dockerfile: ".devops/jarvis-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" } - { tag: "light-intel", dockerfile: ".devops/jarvis-cli-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" } - { tag: "server-intel", dockerfile: ".devops/jarvis-server-intel.Dockerfile", platforms: "linux/amd64" }
steps: steps:
- name: Check out the repo - name: Check out the repo
uses: actions/checkout@v4 uses: actions/checkout@v4
View file
@ -11,7 +11,7 @@ jobs:
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
with: with:
repository: "ggerganov/llama.cpp" repository: "ggerganov/jarvis.cpp"
- uses: actions/labeler@v5 - uses: actions/labeler@v5
with: with:
configuration-path: '.github/labeler.yml' configuration-path: '.github/labeler.yml'
View file
@ -47,8 +47,8 @@ jobs:
extra-conf: | extra-conf: |
extra-platforms = aarch64-linux extra-platforms = aarch64-linux
extra-system-features = nixos-test kvm extra-system-features = nixos-test kvm
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2 - uses: DeterminateSystems/magic-nix-cache-action@v2
with: with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@ -56,7 +56,7 @@ jobs:
uses: cachix/cachix-action@v13 uses: cachix/cachix-action@v13
with: with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: llama-cpp name: jarvis-cpp
- name: Show all output paths - name: Show all output paths
run: > run: >
nix run github:nix-community/nix-eval-jobs nix run github:nix-community/nix-eval-jobs
View file
@ -34,8 +34,8 @@ jobs:
with: with:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: | extra-conf: |
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2 - uses: DeterminateSystems/magic-nix-cache-action@v2
with: with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@ -61,8 +61,8 @@ jobs:
with: with:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: | extra-conf: |
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2 - uses: DeterminateSystems/magic-nix-cache-action@v2
with: with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@ -70,7 +70,7 @@ jobs:
uses: cachix/cachix-action@v13 uses: cachix/cachix-action@v13
with: with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: llama-cpp name: jarvis-cpp
- name: Build - name: Build
run: > run: >
nix run github:Mic92/nix-fast-build nix run github:Mic92/nix-fast-build
View file
@ -21,10 +21,10 @@ on:
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
env: env:
LLAMA_LOG_COLORS: 1 JARVIS_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1 JARVIS_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1 JARVIS_LOG_TIMESTAMPS: 1
LLAMA_LOG_VERBOSITY: 10 JARVIS_LOG_VERBOSITY: 10
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@ -41,7 +41,7 @@ jobs:
include: include:
- build_type: Release - build_type: Release
sanitizer: "" sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken fail-fast: false # While -DJARVIS_SANITIZE_THREAD=ON is broken
steps: steps:
- name: Dependencies - name: Dependencies
@ -99,12 +99,12 @@ jobs:
run: | run: |
cmake -B build \ cmake -B build \
-DGGML_NATIVE=OFF \ -DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \ -DJARVIS_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \ -DJARVIS_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ; -DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target jarvis-server
- name: Build - name: Build
id: cmake_build id: cmake_build
@ -112,11 +112,11 @@ jobs:
run: | run: |
cmake -B build \ cmake -B build \
-DGGML_NATIVE=OFF \ -DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \ -DJARVIS_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \ -DJARVIS_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target jarvis-server
- name: Tests - name: Tests
id: server_integration_tests id: server_integration_tests
@ -155,8 +155,8 @@ jobs:
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" cmake -B build -DJARVIS_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target jarvis-server
- name: Python setup - name: Python setup
id: setup_python id: setup_python
@ -180,7 +180,7 @@ jobs:
run: | run: |
cd examples/server/tests cd examples/server/tests
$env:PYTHONIOENCODING = ":replace" $env:PYTHONIOENCODING = ":replace"
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags jarvis.cpp
- name: Slow tests - name: Slow tests
id: server_integration_tests_slow id: server_integration_tests_slow
.gitignore vendored
View file
@ -48,8 +48,8 @@ build*
!build-info.sh !build-info.sh
!build.zig !build.zig
!docs/build.md !docs/build.md
/libllama.so /libjarvis.so
/llama-* /jarvis-*
/vulkan-shaders-gen /vulkan-shaders-gen
android-ndk-* android-ndk-*
arm_neon.h arm_neon.h
@ -57,7 +57,7 @@ cmake-build-*
CMakeSettings.json CMakeSettings.json
compile_commands.json compile_commands.json
ggml-metal-embed.metal ggml-metal-embed.metal
llama-batched-swift jarvis-batched-swift
/rpc-server /rpc-server
out/ out/
tmp/ tmp/
@ -118,7 +118,7 @@ poetry.toml
/tests/test-double-float /tests/test-double-float
/tests/test-grad0 /tests/test-grad0
/tests/test-grammar-parser /tests/test-grammar-parser
/tests/test-llama-grammar /tests/test-jarvis-grammar
/tests/test-opt /tests/test-opt
/tests/test-quantize-fns /tests/test-quantize-fns
/tests/test-quantize-perf /tests/test-quantize-perf
View file
@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX) project("jarvis.cpp" C CXX)
include(CheckIncludeFileCXX) include(CheckIncludeFileCXX)
#set(CMAKE_WARN_DEPRECATED YES) #set(CMAKE_WARN_DEPRECATED YES)
@ -18,20 +18,20 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(LLAMA_STANDALONE ON) set(JARVIS_STANDALONE ON)
include(git-vars) include(git-vars)
# configure project version # configure project version
# TODO # TODO
else() else()
set(LLAMA_STANDALONE OFF) set(JARVIS_STANDALONE OFF)
endif() endif()
if (EMSCRIPTEN) if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF) set(BUILD_SHARED_LIBS_DEFAULT OFF)
option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON) option(JARVIS_WASM_SINGLE_FILE "jarvis: embed WASM inside the generated jarvis.js" ON)
else() else()
if (MINGW) if (MINGW)
set(BUILD_SHARED_LIBS_DEFAULT OFF) set(BUILD_SHARED_LIBS_DEFAULT OFF)
@ -51,41 +51,41 @@ endif()
# #
# debug # debug
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) option(JARVIS_ALL_WARNINGS "jarvis: enable all compiler warnings" ON)
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) option(JARVIS_ALL_WARNINGS_3RD_PARTY "jarvis: enable all compiler warnings in 3rd party libs" OFF)
# build # build
option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF) option(JARVIS_FATAL_WARNINGS "jarvis: enable -Werror flag" OFF)
# sanitizers # sanitizers
option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) option(JARVIS_SANITIZE_THREAD "jarvis: enable thread sanitizer" OFF)
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) option(JARVIS_SANITIZE_ADDRESS "jarvis: enable address sanitizer" OFF)
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) option(JARVIS_SANITIZE_UNDEFINED "jarvis: enable undefined sanitizer" OFF)
# utils # utils
option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE}) option(JARVIS_BUILD_COMMON "jarvis: build common utils library" ${JARVIS_STANDALONE})
# extra artifacts # extra artifacts
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(JARVIS_BUILD_TESTS "jarvis: build tests" ${JARVIS_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(JARVIS_BUILD_EXAMPLES "jarvis: build examples" ${JARVIS_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) option(JARVIS_BUILD_SERVER "jarvis: build server example" ${JARVIS_STANDALONE})
# 3rd party libs # 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) option(JARVIS_CURL "jarvis: use libcurl to download model from an URL" OFF)
# Required for relocatable CMake package # Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
# override ggml options # override ggml options
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD}) set(GGML_SANITIZE_THREAD ${JARVIS_SANITIZE_THREAD})
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS}) set(GGML_SANITIZE_ADDRESS ${JARVIS_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED}) set(GGML_SANITIZE_UNDEFINED ${JARVIS_SANITIZE_UNDEFINED})
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_ALL_WARNINGS ${JARVIS_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) set(GGML_FATAL_WARNINGS ${JARVIS_FATAL_WARNINGS})
# change the default for these ggml options # change the default for these ggml options
if (NOT DEFINED GGML_LLAMAFILE) if (NOT DEFINED GGML_JARVISFILE)
set(GGML_LLAMAFILE_DEFAULT ON) set(GGML_JARVISFILE_DEFAULT ON)
endif() endif()
if (NOT DEFINED GGML_AMX) if (NOT DEFINED GGML_AMX)
@ -97,23 +97,23 @@ if (NOT DEFINED GGML_CUDA_GRAPHS)
endif() endif()
# transition helpers # transition helpers
function (llama_option_depr TYPE OLD NEW) function (jarvis_option_depr TYPE OLD NEW)
if (${OLD}) if (${OLD})
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n") message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
set(${NEW} ON PARENT_SCOPE) set(${NEW} ON PARENT_SCOPE)
endif() endif()
endfunction() endfunction()
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA) jarvis_option_depr(FATAL_ERROR JARVIS_CUBLAS GGML_CUDA)
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA) jarvis_option_depr(WARNING JARVIS_CUDA GGML_CUDA)
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE) jarvis_option_depr(WARNING JARVIS_KOMPUTE GGML_KOMPUTE)
llama_option_depr(WARNING LLAMA_METAL GGML_METAL) jarvis_option_depr(WARNING JARVIS_METAL GGML_METAL)
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY) jarvis_option_depr(WARNING JARVIS_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) jarvis_option_depr(WARNING JARVIS_NATIVE GGML_NATIVE)
llama_option_depr(WARNING LLAMA_RPC GGML_RPC) jarvis_option_depr(WARNING JARVIS_RPC GGML_RPC)
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) jarvis_option_depr(WARNING JARVIS_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) jarvis_option_depr(WARNING JARVIS_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN) jarvis_option_depr(WARNING JARVIS_CANN GGML_CANN)
# #
# build the library # build the library
@ -132,18 +132,18 @@ add_subdirectory(src)
include(GNUInstallDirs) include(GNUInstallDirs)
include(CMakePackageConfigHelpers) include(CMakePackageConfigHelpers)
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) set(JARVIS_BUILD_NUMBER ${BUILD_NUMBER})
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) set(JARVIS_BUILD_COMMIT ${BUILD_COMMIT})
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER}) set(JARVIS_INSTALL_VERSION 0.0.${BUILD_NUMBER})
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") set(JARVIS_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") set(JARVIS_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") set(JARVIS_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
# At the moment some compile definitions are placed within the ggml/src # At the moment some compile definitions are placed within the ggml/src
# directory but not exported on the `ggml` target. This could be improved by # directory but not exported on the `ggml` target. This could be improved by
# determining _precisely_ which defines are necessary for the llama-config # determining _precisely_ which defines are necessary for the jarvis-config
# package. # package.
# #
set(GGML_TRANSIENT_DEFINES) set(GGML_TRANSIENT_DEFINES)
@ -158,25 +158,25 @@ if (GGML_TARGET_DEFINES)
endif() endif()
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h) set_target_properties(jarvis PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/jarvis.h)
install(TARGETS llama LIBRARY PUBLIC_HEADER) install(TARGETS jarvis LIBRARY PUBLIC_HEADER)
configure_package_config_file( configure_package_config_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in ${CMAKE_CURRENT_SOURCE_DIR}/cmake/jarvis-config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake ${CMAKE_CURRENT_BINARY_DIR}/jarvis-config.cmake
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/jarvis
PATH_VARS LLAMA_INCLUDE_INSTALL_DIR PATH_VARS JARVIS_INCLUDE_INSTALL_DIR
LLAMA_LIB_INSTALL_DIR JARVIS_LIB_INSTALL_DIR
LLAMA_BIN_INSTALL_DIR ) JARVIS_BIN_INSTALL_DIR )
write_basic_package_version_file( write_basic_package_version_file(
${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake ${CMAKE_CURRENT_BINARY_DIR}/jarvis-version.cmake
VERSION ${LLAMA_INSTALL_VERSION} VERSION ${JARVIS_INSTALL_VERSION}
COMPATIBILITY SameMajorVersion) COMPATIBILITY SameMajorVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake install(FILES ${CMAKE_CURRENT_BINARY_DIR}/jarvis-config.cmake
${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake ${CMAKE_CURRENT_BINARY_DIR}/jarvis-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama) DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/jarvis)
install( install(
FILES convert_hf_to_gguf.py FILES convert_hf_to_gguf.py
@ -190,27 +190,27 @@ install(
WORLD_EXECUTE WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR}) DESTINATION ${CMAKE_INSTALL_BINDIR})
configure_file(cmake/llama.pc.in configure_file(cmake/jarvis.pc.in
"${CMAKE_CURRENT_BINARY_DIR}/llama.pc" "${CMAKE_CURRENT_BINARY_DIR}/jarvis.pc"
@ONLY) @ONLY)
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" install(FILES "${CMAKE_CURRENT_BINARY_DIR}/jarvis.pc"
DESTINATION lib/pkgconfig) DESTINATION lib/pkgconfig)
# #
# utils, programs, examples and tests # utils, programs, examples and tests
# #
if (LLAMA_BUILD_COMMON) if (JARVIS_BUILD_COMMON)
add_subdirectory(common) add_subdirectory(common)
endif() endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) if (JARVIS_BUILD_COMMON AND JARVIS_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest) include(CTest)
add_subdirectory(tests) add_subdirectory(tests)
endif() endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES) if (JARVIS_BUILD_COMMON AND JARVIS_BUILD_EXAMPLES)
add_subdirectory(examples) add_subdirectory(examples)
add_subdirectory(pocs) add_subdirectory(pocs)
endif() endif()
View file
@ -11,7 +11,7 @@
- Squash-merge PRs - Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)` - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules - Optionally pick a `<module>` from here: https://github.com/ggerganov/jarvis.cpp/wiki/Modules
# Coding guidelines # Coding guidelines
@ -22,7 +22,7 @@
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963) - Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ - Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/jarvis.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
![matmul](media/matmul.png) ![matmul](media/matmul.png)
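As a quick shape check of this convention (the dimensions $m$, $n$, $k$ below are illustrative only): if $A$ has $m$ rows and $B$ has $n$ rows, with both sharing $k$ columns, the two equivalent forms give the same result shape:

$$
A \in \mathbb{R}^{m \times k},\; B \in \mathbb{R}^{n \times k}
\;\Longrightarrow\;
A B^T \in \mathbb{R}^{m \times n}
\;\Longrightarrow\;
C = B A^T = \left(A B^T\right)^T \in \mathbb{R}^{n \times m}.
$$

So the two operands of `ggml_mul_mat` must agree in their shared dimension $k$, and the result $C$ has shape $n \times m$.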
@ -30,4 +30,4 @@
The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects: The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
https://github.com/ggerganov/llama.cpp/projects https://github.com/ggerganov/jarvis.cpp/projects
View file
@ -7,7 +7,7 @@ import java.util.Scanner;
public class LLMCLI { public class LLMCLI {
public static void main(String[] args) { public static void main(String[] args) {
// Path to the .exe file // Path to the .exe file
String exePath = "bin/llama-cli.exe"; String exePath = "bin/jarvis-cli.exe";
System.out.println("Enter -h for help"); System.out.println("Enter -h for help");
// Scanner to take user input for various commands // Scanner to take user input for various commands
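The hunk above shows only the opening of the Java command-line wrapper. A minimal sketch of how such a launcher might forward a single user-typed command to the renamed binary follows; this is an illustrative assumption, not the file's actual body (the class name `LLMCLISketch` and the single-shot read are made up for the example):

```java
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Scanner;

public class LLMCLISketch {
    public static void main(String[] args) throws IOException, InterruptedException {
        // Same relative path as in the fragment above.
        String exePath = "bin/jarvis-cli.exe";

        System.out.println("Enter -h for help");
        try (Scanner scanner = new Scanner(System.in)) {
            String input = scanner.nextLine();

            // Build the command line: the executable followed by the user-supplied flags.
            List<String> command = new ArrayList<>();
            command.add(exePath);
            command.addAll(Arrays.asList(input.trim().split("\\s+")));

            // Launch the native binary and stream its stdout/stderr to this console.
            Process process = new ProcessBuilder(command)
                    .inheritIO()
                    .start();
            int exitCode = process.waitFor();
            System.out.println("jarvis-cli exited with code " + exitCode);
        }
    }
}
```

Typing `-h` at the prompt would simply pass that flag through to `bin/jarvis-cli.exe`.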
Makefile
View file
@ -1,44 +1,44 @@
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = \ BUILD_TARGETS = \
libllava.a \ libllava.a \
llama-baby-llama \ jarvis-baby-jarvis \
llama-batched \ jarvis-batched \
llama-batched-bench \ jarvis-batched-bench \
llama-bench \ jarvis-bench \
llama-cli \ jarvis-cli \
llama-convert-llama2c-to-ggml \ jarvis-convert-jarvis2c-to-ggml \
llama-embedding \ jarvis-embedding \
llama-eval-callback \ jarvis-eval-callback \
llama-export-lora \ jarvis-export-lora \
llama-gbnf-validator \ jarvis-gbnf-validator \
llama-gguf \ jarvis-gguf \
llama-gguf-hash \ jarvis-gguf-hash \
llama-gguf-split \ jarvis-gguf-split \
llama-gritlm \ jarvis-gritlm \
llama-imatrix \ jarvis-imatrix \
llama-infill \ jarvis-infill \
llama-llava-cli \ jarvis-llava-cli \
llama-minicpmv-cli\ jarvis-minicpmv-cli\
llama-lookahead \ jarvis-lookahead \
llama-lookup \ jarvis-lookup \
llama-lookup-create \ jarvis-lookup-create \
llama-lookup-merge \ jarvis-lookup-merge \
llama-lookup-stats \ jarvis-lookup-stats \
llama-parallel \ jarvis-parallel \
llama-passkey \ jarvis-passkey \
llama-perplexity \ jarvis-perplexity \
llama-q8dot \ jarvis-q8dot \
llama-quantize \ jarvis-quantize \
llama-quantize-stats \ jarvis-quantize-stats \
llama-retrieval \ jarvis-retrieval \
llama-save-load-state \ jarvis-save-load-state \
llama-server \ jarvis-server \
llama-simple \ jarvis-simple \
llama-speculative \ jarvis-speculative \
llama-tokenize \ jarvis-tokenize \
llama-vdot \ jarvis-vdot \
llama-cvector-generator \ jarvis-cvector-generator \
llama-gen-docs \ jarvis-gen-docs \
tests/test-c.o tests/test-c.o
# Binaries only useful for tests # Binaries only useful for tests
@ -52,7 +52,7 @@ TEST_TARGETS = \
tests/test-grammar-integration \ tests/test-grammar-integration \
tests/test-grammar-parser \ tests/test-grammar-parser \
tests/test-json-schema-to-grammar \ tests/test-json-schema-to-grammar \
tests/test-llama-grammar \ tests/test-jarvis-grammar \
tests/test-log \ tests/test-log \
tests/test-model-load-cancel \ tests/test-model-load-cancel \
tests/test-opt \ tests/test-opt \
@ -65,8 +65,8 @@ TEST_TARGETS = \
tests/test-tokenizer-1-spm tests/test-tokenizer-1-spm
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \ LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-jarvis2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ simple batched batched-bench save-load-state server gguf gguf-split eval-callback jarvis-bench libllava.a llava-cli baby-jarvis \
retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
# Legacy build targets that were renamed in #7809, but we want to build binaries for them that output a deprecation warning if people try to use them. # Legacy build targets that were renamed in #7809, but we want to build binaries for them that output a deprecation warning if people try to use them.
@ -74,80 +74,80 @@ LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
# Deprecation aliases # Deprecation aliases
ifdef LLAMA_CUBLAS ifdef JARVIS_CUBLAS
$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.) $(error JARVIS_CUBLAS is removed. Use GGML_CUDA instead.)
endif endif
ifdef LLAMA_CUDA ifdef JARVIS_CUDA
GGML_CUDA := 1 GGML_CUDA := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
ifdef LLAMA_KOMPUTE ifdef JARVIS_KOMPUTE
GGML_KOMPUTE := 1 GGML_KOMPUTE := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
ifdef LLAMA_METAL ifdef JARVIS_METAL
GGML_METAL := 1 GGML_METAL := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
ifdef LLAMA_RPC ifdef JARVIS_RPC
GGML_RPC := 1 GGML_RPC := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
ifdef LLAMA_SYCL ifdef JARVIS_SYCL
GGML_SYCL := 1 GGML_SYCL := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
ifdef LLAMA_SYCL_F16 ifdef JARVIS_SYCL_F16
GGML_SYCL_F16 := 1 GGML_SYCL_F16 := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
ifdef LLAMA_OPENBLAS ifdef JARVIS_OPENBLAS
GGML_OPENBLAS := 1 GGML_OPENBLAS := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
ifdef LLAMA_OPENBLAS64 ifdef JARVIS_OPENBLAS64
GGML_OPENBLAS64 := 1 GGML_OPENBLAS64 := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
ifdef LLAMA_BLIS ifdef JARVIS_BLIS
GGML_BLIS := 1 GGML_BLIS := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
ifdef LLAMA_NO_LLAMAFILE ifdef JARVIS_NO_JARVISFILE
GGML_NO_LLAMAFILE := 1 GGML_NO_JARVISFILE := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
ifdef LLAMA_NO_ACCELERATE ifdef JARVIS_NO_ACCELERATE
GGML_NO_ACCELERATE := 1 GGML_NO_ACCELERATE := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
ifdef LLAMA_NO_OPENMP ifdef JARVIS_NO_OPENMP
GGML_NO_OPENMP := 1 GGML_NO_OPENMP := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
ifdef LLAMA_NO_METAL ifdef JARVIS_NO_METAL
GGML_NO_METAL := 1 GGML_NO_METAL := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
ifdef LLAMA_DISABLE_LOGS ifdef JARVIS_DISABLE_LOGS
REMOVE_WARNING := 1 REMOVE_WARNING := 1
endif endif
ifdef LLAMA_SERVER_VERBOSE ifdef JARVIS_SERVER_VERBOSE
REMOVE_WARNING := 1 REMOVE_WARNING := 1
endif endif
@ -211,8 +211,8 @@ test: $(TEST_TARGETS)
@failures=0; \ @failures=0; \
for test_target in $(TEST_TARGETS); do \ for test_target in $(TEST_TARGETS); do \
if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \ if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-jarvis-spm.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-jarvis-bpe.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
@ -257,7 +257,7 @@ MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC MK_CXXFLAGS = -std=c++11 -fPIC
MK_NVCCFLAGS = -std=c++11 MK_NVCCFLAGS = -std=c++11
ifdef LLAMA_NO_CCACHE ifdef JARVIS_NO_CCACHE
GGML_NO_CCACHE := 1 GGML_NO_CCACHE := 1
DEPRECATE_WARNING := 1 DEPRECATE_WARNING := 1
endif endif
@ -320,7 +320,7 @@ ifdef GGML_SCHED_MAX_COPIES
MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES) MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES)
endif endif
ifdef LLAMA_DEBUG ifdef JARVIS_DEBUG
MK_CFLAGS += -O0 -g MK_CFLAGS += -O0 -g
MK_CXXFLAGS += -O0 -g MK_CXXFLAGS += -O0 -g
MK_LDFLAGS += -g MK_LDFLAGS += -g
@ -336,25 +336,25 @@ else
MK_NVCCFLAGS += -O3 -g MK_NVCCFLAGS += -O3 -g
endif endif
ifdef LLAMA_SANITIZE_THREAD ifdef JARVIS_SANITIZE_THREAD
MK_CFLAGS += -fsanitize=thread -g MK_CFLAGS += -fsanitize=thread -g
MK_CXXFLAGS += -fsanitize=thread -g MK_CXXFLAGS += -fsanitize=thread -g
MK_LDFLAGS += -fsanitize=thread -g MK_LDFLAGS += -fsanitize=thread -g
endif endif
ifdef LLAMA_SANITIZE_ADDRESS ifdef JARVIS_SANITIZE_ADDRESS
MK_CFLAGS += -fsanitize=address -fno-omit-frame-pointer -g MK_CFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
MK_LDFLAGS += -fsanitize=address -fno-omit-frame-pointer -g MK_LDFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
endif endif
ifdef LLAMA_SANITIZE_UNDEFINED ifdef JARVIS_SANITIZE_UNDEFINED
MK_CFLAGS += -fsanitize=undefined -g MK_CFLAGS += -fsanitize=undefined -g
MK_CXXFLAGS += -fsanitize=undefined -g MK_CXXFLAGS += -fsanitize=undefined -g
MK_LDFLAGS += -fsanitize=undefined -g MK_LDFLAGS += -fsanitize=undefined -g
endif endif
ifdef LLAMA_SERVER_SSL ifdef JARVIS_SERVER_SSL
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
MK_LDFLAGS += -lssl -lcrypto MK_LDFLAGS += -lssl -lcrypto
endif endif
@ -381,7 +381,7 @@ MK_CXXFLAGS += \
-Wmissing-declarations \ -Wmissing-declarations \
-Wmissing-noreturn -Wmissing-noreturn
ifeq ($(LLAMA_FATAL_WARNINGS),1) ifeq ($(JARVIS_FATAL_WARNINGS),1)
MK_CFLAGS += -Werror MK_CFLAGS += -Werror
MK_CXXFLAGS += -Werror MK_CXXFLAGS += -Werror
endif endif
@ -420,7 +420,7 @@ ifeq ($(_WIN32),1)
LWINSOCK2 := -lws2_32 LWINSOCK2 := -lws2_32
endif endif
ifdef LLAMA_GPROF ifdef JARVIS_GPROF
MK_CFLAGS += -pg MK_CFLAGS += -pg
MK_CXXFLAGS += -pg MK_CXXFLAGS += -pg
endif endif
@ -448,7 +448,7 @@ endif
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
# https://github.com/ggerganov/llama.cpp/issues/2922 # https://github.com/ggerganov/jarvis.cpp/issues/2922
MK_CFLAGS += -Xassembler -muse-unaligned-vector-move MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
@ -574,9 +574,9 @@ ifdef GGML_NVPL
OBJ_GGML += ggml/src/ggml-blas.o OBJ_GGML += ggml/src/ggml-blas.o
endif # GGML_NVPL endif # GGML_NVPL
ifndef GGML_NO_LLAMAFILE ifndef GGML_NO_JARVISFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE MK_CPPFLAGS += -DGGML_USE_JARVISFILE
OBJ_GGML += ggml/src/llamafile/sgemm.o OBJ_GGML += ggml/src/jarvisfile/sgemm.o
endif endif
ifndef GGML_NO_AMX ifndef GGML_NO_AMX
@ -627,9 +627,9 @@ ifdef GGML_CUDA
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
OBJ_GGML += $(OBJ_CUDA_TMPL) OBJ_GGML += $(OBJ_CUDA_TMPL)
ifdef LLAMA_FATAL_WARNINGS ifdef JARVIS_FATAL_WARNINGS
MK_NVCCFLAGS += -Werror all-warnings MK_NVCCFLAGS += -Werror all-warnings
endif # LLAMA_FATAL_WARNINGS endif # JARVIS_FATAL_WARNINGS
ifndef GGML_MUSA ifndef GGML_MUSA
ifndef JETSON_EOL_MODULE_DETECT ifndef JETSON_EOL_MODULE_DETECT
@ -637,9 +637,9 @@ ifndef JETSON_EOL_MODULE_DETECT
endif # JETSON_EOL_MODULE_DETECT endif # JETSON_EOL_MODULE_DETECT
endif # GGML_MUSA endif # GGML_MUSA
ifdef LLAMA_DEBUG ifdef JARVIS_DEBUG
MK_NVCCFLAGS += -lineinfo MK_NVCCFLAGS += -lineinfo
endif # LLAMA_DEBUG endif # JARVIS_DEBUG
ifdef GGML_CUDA_DEBUG ifdef GGML_CUDA_DEBUG
MK_NVCCFLAGS += --device-debug MK_NVCCFLAGS += --device-debug
@ -920,11 +920,11 @@ OBJ_GGML += \
ggml/src/ggml-quants.o \ ggml/src/ggml-quants.o \
ggml/src/ggml-aarch64.o ggml/src/ggml-aarch64.o
OBJ_LLAMA = \ OBJ_JARVIS = \
src/llama.o \ src/jarvis.o \
src/llama-vocab.o \ src/jarvis-vocab.o \
src/llama-grammar.o \ src/jarvis-grammar.o \
src/llama-sampling.o \ src/jarvis-sampling.o \
src/unicode.o \ src/unicode.o \
src/unicode-data.o src/unicode-data.o
@ -939,19 +939,19 @@ OBJ_COMMON = \
common/build-info.o \ common/build-info.o \
common/json-schema-to-grammar.o common/json-schema-to-grammar.o
OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) OBJ_ALL = $(OBJ_GGML) $(OBJ_JARVIS) $(OBJ_COMMON)
LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT) LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT)
LIB_GGML_S = $(LIB_PRE)ggml.a LIB_GGML_S = $(LIB_PRE)ggml.a
LIB_LLAMA = $(LIB_PRE)llama$(DSO_EXT) LIB_JARVIS = $(LIB_PRE)jarvis$(DSO_EXT)
LIB_LLAMA_S = $(LIB_PRE)llama.a LIB_JARVIS_S = $(LIB_PRE)jarvis.a
LIB_COMMON = $(LIB_PRE)common$(DSO_EXT) LIB_COMMON = $(LIB_PRE)common$(DSO_EXT)
LIB_COMMON_S = $(LIB_PRE)common.a LIB_COMMON_S = $(LIB_PRE)common.a
LIB_ALL = $(LIB_GGML) $(LIB_LLAMA) $(LIB_COMMON) LIB_ALL = $(LIB_GGML) $(LIB_JARVIS) $(LIB_COMMON)
LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S) LIB_ALL_S = $(LIB_GGML_S) $(LIB_JARVIS_S) $(LIB_COMMON_S)
GF_CC := $(CC) GF_CC := $(CC)
include scripts/get-flags.mk include scripts/get-flags.mk
@ -971,8 +971,8 @@ include scripts/get-flags.mk
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
endif endif
ifdef LLAMA_CURL ifdef JARVIS_CURL
override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL override CXXFLAGS := $(CXXFLAGS) -DJARVIS_USE_CURL
override LDFLAGS := $(LDFLAGS) -lcurl override LDFLAGS := $(LDFLAGS) -lcurl
endif endif
@ -980,7 +980,7 @@ endif
# Print build information # Print build information
# #
$(info I llama.cpp build info: ) $(info I jarvis.cpp build info: )
$(info I UNAME_S: $(UNAME_S)) $(info I UNAME_S: $(UNAME_S))
$(info I UNAME_P: $(UNAME_P)) $(info I UNAME_P: $(UNAME_P))
$(info I UNAME_M: $(UNAME_M)) $(info I UNAME_M: $(UNAME_M))
@ -1009,30 +1009,30 @@ $(info )
ifdef DEPRECATE_WARNING ifdef DEPRECATE_WARNING
$(info !!! DEPRECATION WARNING !!!) $(info !!! DEPRECATION WARNING !!!)
$(info The following LLAMA_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead) $(info The following JARVIS_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead)
$(info - LLAMA_CUDA) $(info - JARVIS_CUDA)
$(info - LLAMA_METAL) $(info - JARVIS_METAL)
$(info - LLAMA_METAL_EMBED_LIBRARY) $(info - JARVIS_METAL_EMBED_LIBRARY)
$(info - LLAMA_OPENMP) $(info - JARVIS_OPENMP)
$(info - LLAMA_RPC) $(info - JARVIS_RPC)
$(info - LLAMA_SYCL) $(info - JARVIS_SYCL)
$(info - LLAMA_SYCL_F16) $(info - JARVIS_SYCL_F16)
$(info - LLAMA_OPENBLAS) $(info - JARVIS_OPENBLAS)
$(info - LLAMA_OPENBLAS64) $(info - JARVIS_OPENBLAS64)
$(info - LLAMA_BLIS) $(info - JARVIS_BLIS)
$(info - LLAMA_NO_LLAMAFILE) $(info - JARVIS_NO_JARVISFILE)
$(info - LLAMA_NO_ACCELERATE) $(info - JARVIS_NO_ACCELERATE)
$(info - LLAMA_NO_OPENMP) $(info - JARVIS_NO_OPENMP)
$(info - LLAMA_NO_METAL) $(info - JARVIS_NO_METAL)
$(info - LLAMA_NO_CCACHE) $(info - JARVIS_NO_CCACHE)
$(info ) $(info )
endif endif
ifdef REMOVE_WARNING ifdef REMOVE_WARNING
$(info !!! REMOVAL WARNING !!!) $(info !!! REMOVAL WARNING !!!)
$(info The following LLAMA_ options have been removed and are no longer supported) $(info The following JARVIS_ options have been removed and are no longer supported)
$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418)) $(info - JARVIS_DISABLE_LOGS (https://github.com/ggerganov/jarvis.cpp/pull/9418))
$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418)) $(info - JARVIS_SERVER_VERBOSE (https://github.com/ggerganov/jarvis.cpp/pull/9418))
$(info ) $(info )
endif endif
@ -1079,13 +1079,13 @@ ggml/src/ggml-blas.o: \
ggml/include/ggml-blas.h ggml/include/ggml-blas.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
ifndef GGML_NO_LLAMAFILE ifndef GGML_NO_JARVISFILE
ggml/src/llamafile/sgemm.o: \ ggml/src/jarvisfile/sgemm.o: \
ggml/src/llamafile/sgemm.cpp \ ggml/src/jarvisfile/sgemm.cpp \
ggml/src/llamafile/sgemm.h \ ggml/src/jarvisfile/sgemm.h \
ggml/include/ggml.h ggml/include/ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_NO_LLAMAFILE endif # GGML_NO_JARVISFILE
ifndef GGML_NO_AMX ifndef GGML_NO_AMX
ggml/src/ggml-amx.o: \ ggml/src/ggml-amx.o: \
@ -1115,7 +1115,7 @@ $(LIB_GGML_S): \
$(OBJ_GGML) $(OBJ_GGML)
ar rcs $(LIB_GGML_S) $^ ar rcs $(LIB_GGML_S) $^
# llama # jarvis
src/unicode.o: \ src/unicode.o: \
src/unicode.cpp \ src/unicode.cpp \
@ -1127,14 +1127,14 @@ src/unicode-data.o: \
src/unicode-data.h src/unicode-data.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
src/llama.o: \ src/jarvis.o: \
src/llama.cpp \ src/jarvis.cpp \
src/llama-impl.h \ src/jarvis-impl.h \
src/llama-vocab.h \ src/jarvis-vocab.h \
src/llama-grammar.h \ src/jarvis-grammar.h \
src/llama-sampling.h \ src/jarvis-sampling.h \
src/unicode.h \ src/unicode.h \
include/llama.h \ include/jarvis.h \
ggml/include/ggml-cuda.h \ ggml/include/ggml-cuda.h \
ggml/include/ggml-metal.h \ ggml/include/ggml-metal.h \
ggml/include/ggml.h \ ggml/include/ggml.h \
@ -1142,37 +1142,37 @@ src/llama.o: \
ggml/include/ggml-backend.h ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-vocab.o: \ src/jarvis-vocab.o: \
src/llama-vocab.cpp \ src/jarvis-vocab.cpp \
src/llama-vocab.h \ src/jarvis-vocab.h \
src/llama-impl.h \ src/jarvis-impl.h \
include/llama.h include/jarvis.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-grammar.o: \ src/jarvis-grammar.o: \
src/llama-grammar.cpp \ src/jarvis-grammar.cpp \
src/llama-grammar.h \ src/jarvis-grammar.h \
src/llama-impl.h \ src/jarvis-impl.h \
src/llama-vocab.h \ src/jarvis-vocab.h \
src/llama-sampling.h \ src/jarvis-sampling.h \
include/llama.h include/jarvis.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-sampling.o: \ src/jarvis-sampling.o: \
src/llama-sampling.cpp \ src/jarvis-sampling.cpp \
src/llama-sampling.h \ src/jarvis-sampling.h \
src/llama-impl.h \ src/jarvis-impl.h \
include/llama.h include/jarvis.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
$(LIB_LLAMA): \ $(LIB_JARVIS): \
$(OBJ_LLAMA) \ $(OBJ_JARVIS) \
$(LIB_GGML) $(LIB_GGML)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
$(LIB_LLAMA_S): \ $(LIB_JARVIS_S): \
$(OBJ_LLAMA) $(OBJ_JARVIS)
ar rcs $(LIB_LLAMA_S) $^ ar rcs $(LIB_JARVIS_S) $^
# common # common
@ -1183,7 +1183,7 @@ common/common.o: \
common/sampling.h \ common/sampling.h \
common/json.hpp \ common/json.hpp \
common/json-schema-to-grammar.h \ common/json-schema-to-grammar.h \
include/llama.h include/jarvis.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
common/arg.o: \ common/arg.o: \
@ -1199,7 +1199,7 @@ common/log.o: \
common/sampling.o: \ common/sampling.o: \
common/sampling.cpp \ common/sampling.cpp \
common/sampling.h \ common/sampling.h \
include/llama.h include/jarvis.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
common/console.o: \ common/console.o: \
@ -1224,7 +1224,7 @@ common/ngram-cache.o: \
$(LIB_COMMON): \ $(LIB_COMMON): \
$(OBJ_COMMON) \ $(OBJ_COMMON) \
$(LIB_LLAMA) \ $(LIB_JARVIS) \
$(LIB_GGML) $(LIB_GGML)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
@ -1246,7 +1246,7 @@ clean:
rm -rvf ggml/*.dll rm -rvf ggml/*.dll
rm -rvf ggml/*.so rm -rvf ggml/*.so
rm -vrf ggml/src/*.o rm -vrf ggml/src/*.o
rm -rvf ggml/src/llamafile/*.o rm -rvf ggml/src/jarvisfile/*.o
rm -rvf common/build-info.cpp rm -rvf common/build-info.cpp
rm -vrf ggml/src/ggml-metal-embed.metal rm -vrf ggml/src/ggml-metal-embed.metal
rm -vrf ggml/src/ggml-cuda/*.o rm -vrf ggml/src/ggml-cuda/*.o
@ -1269,75 +1269,75 @@ clean:
# Helper function that replaces .c, .cpp, and .cu file endings with .o: # Helper function that replaces .c, .cpp, and .cu file endings with .o:
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
llama-cli: examples/main/main.cpp \ jarvis-cli: examples/main/main.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo @echo
@echo '==== Run ./llama-cli -h for help. ====' @echo '==== Run ./jarvis-cli -h for help. ===='
@echo @echo
llama-infill: examples/infill/infill.cpp \ jarvis-infill: examples/infill/infill.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-simple: examples/simple/simple.cpp \ jarvis-simple: examples/simple/simple.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-tokenize: examples/tokenize/tokenize.cpp \ jarvis-tokenize: examples/tokenize/tokenize.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-batched: examples/batched/batched.cpp \ jarvis-batched: examples/batched/batched.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-batched-bench: examples/batched-bench/batched-bench.cpp \ jarvis-batched-bench: examples/batched-bench/batched-bench.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-quantize: examples/quantize/quantize.cpp \ jarvis-quantize: examples/quantize/quantize.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \ jarvis-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-perplexity: examples/perplexity/perplexity.cpp \ jarvis-perplexity: examples/perplexity/perplexity.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-imatrix: examples/imatrix/imatrix.cpp \ jarvis-imatrix: examples/imatrix/imatrix.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-embedding: examples/embedding/embedding.cpp \ jarvis-embedding: examples/embedding/embedding.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-gritlm: examples/gritlm/gritlm.cpp \ jarvis-gritlm: examples/gritlm/gritlm.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-save-load-state: examples/save-load-state/save-load-state.cpp \ jarvis-save-load-state: examples/save-load-state/save-load-state.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-gguf: examples/gguf/gguf.cpp \ jarvis-gguf: examples/gguf/gguf.cpp \
$(OBJ_GGML) $(OBJ_GGML)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1354,92 +1354,92 @@ examples/gguf-hash/deps/sha256/sha256.o: \
examples/gguf-hash/deps/sha256/sha256.c examples/gguf-hash/deps/sha256/sha256.c
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@ $(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\ jarvis-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-gguf-split: examples/gguf-split/gguf-split.cpp \ jarvis-gguf-split: examples/gguf-split/gguf-split.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-eval-callback: examples/eval-callback/eval-callback.cpp \ jarvis-eval-callback: examples/eval-callback/eval-callback.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ jarvis-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ jarvis-convert-jarvis2c-to-ggml: examples/convert-jarvis2c-to-ggml/convert-jarvis2c-to-ggml.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-bench: examples/llama-bench/llama-bench.cpp \ jarvis-bench: examples/jarvis-bench/jarvis-bench.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-baby-llama: examples/baby-llama/baby-llama.cpp \ jarvis-baby-jarvis: examples/baby-jarvis/baby-jarvis.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-export-lora: examples/export-lora/export-lora.cpp \ jarvis-export-lora: examples/export-lora/export-lora.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-retrieval: examples/retrieval/retrieval.cpp \ jarvis-retrieval: examples/retrieval/retrieval.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-speculative: examples/speculative/speculative.cpp \ jarvis-speculative: examples/speculative/speculative.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-parallel: examples/parallel/parallel.cpp \ jarvis-parallel: examples/parallel/parallel.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-lookahead: examples/lookahead/lookahead.cpp \ jarvis-lookahead: examples/lookahead/lookahead.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-lookup: examples/lookup/lookup.cpp \ jarvis-lookup: examples/lookup/lookup.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-lookup-create: examples/lookup/lookup-create.cpp \ jarvis-lookup-create: examples/lookup/lookup-create.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-lookup-merge: examples/lookup/lookup-merge.cpp \ jarvis-lookup-merge: examples/lookup/lookup-merge.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-lookup-stats: examples/lookup/lookup-stats.cpp \ jarvis-lookup-stats: examples/lookup/lookup-stats.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-passkey: examples/passkey/passkey.cpp \ jarvis-passkey: examples/passkey/passkey.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \ jarvis-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1450,7 +1450,7 @@ rpc-server: examples/rpc/rpc-server.cpp \
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif # GGML_RPC endif # GGML_RPC
llama-server: \ jarvis-server: \
examples/server/server.cpp \ examples/server/server.cpp \
examples/server/utils.hpp \ examples/server/utils.hpp \
examples/server/httplib.h \ examples/server/httplib.h \
@ -1485,7 +1485,7 @@ examples/server/%.hpp: examples/server/public/% Makefile
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
) > $@ ) > $@
llama-gen-docs: examples/gen-docs/gen-docs.cpp \ jarvis-gen-docs: examples/gen-docs/gen-docs.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1499,7 +1499,7 @@ libllava.a: examples/llava/llava.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
llama-llava-cli: examples/llava/llava-cli.cpp \ jarvis-llava-cli: examples/llava/llava-cli.cpp \
examples/llava/llava.cpp \ examples/llava/llava.cpp \
examples/llava/llava.h \ examples/llava/llava.h \
examples/llava/clip.cpp \ examples/llava/clip.cpp \
@ -1507,7 +1507,7 @@ llama-llava-cli: examples/llava/llava-cli.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \ jarvis-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
examples/llava/llava.cpp \ examples/llava/llava.cpp \
examples/llava/llava.h \ examples/llava/llava.h \
examples/llava/clip.cpp \ examples/llava/clip.cpp \
@ -1542,7 +1542,7 @@ tests/test-arg-parser: tests/test-arg-parser.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-llama-grammar: tests/test-llama-grammar.cpp \ tests/test-jarvis-grammar: tests/test-jarvis-grammar.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1616,7 +1616,7 @@ tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-c.o: tests/test-c.c include/llama.h tests/test-c.o: tests/test-c.c include/jarvis.h
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
tests/test-backend-ops: tests/test-backend-ops.cpp \ tests/test-backend-ops: tests/test-backend-ops.cpp \
@ -1643,12 +1643,12 @@ tests/test-chat-template: tests/test-chat-template.cpp \
# PoCs # PoCs
# #
llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \ jarvis-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
$(OBJ_GGML) $(OBJ_GGML)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ jarvis-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
$(OBJ_GGML) $(OBJ_GGML)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1667,17 +1667,17 @@ examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning
# Eventually we will want to remove these target from building all the time. # Eventually we will want to remove these target from building all the time.
main: examples/deprecation-warning/deprecation-warning.o main: examples/deprecation-warning/deprecation-warning.o
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead." @echo "NOTICE: The 'main' binary is deprecated. Please use 'jarvis-cli' instead."
server: examples/deprecation-warning/deprecation-warning.o server: examples/deprecation-warning/deprecation-warning.o
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead." @echo "NOTICE: The 'server' binary is deprecated. Please use 'jarvis-server' instead."
quantize: examples/deprecation-warning/deprecation-warning.o quantize: examples/deprecation-warning/deprecation-warning.o
ifneq (,$(wildcard quantize)) ifneq (,$(wildcard quantize))
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "#########" @echo "#########"
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead." @echo "WARNING: The 'quantize' binary is deprecated. Please use 'jarvis-quantize' instead."
@echo " Remove the 'quantize' binary to remove this warning." @echo " Remove the 'quantize' binary to remove this warning."
@echo "#########" @echo "#########"
endif endif
@ -1686,7 +1686,7 @@ perplexity: examples/deprecation-warning/deprecation-warning.o
ifneq (,$(wildcard perplexity)) ifneq (,$(wildcard perplexity))
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "#########" @echo "#########"
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead." @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'jarvis-perplexity' instead."
@echo " Remove the 'perplexity' binary to remove this warning." @echo " Remove the 'perplexity' binary to remove this warning."
@echo "#########" @echo "#########"
endif endif
@ -1695,7 +1695,7 @@ embedding: examples/deprecation-warning/deprecation-warning.o
ifneq (,$(wildcard embedding)) ifneq (,$(wildcard embedding))
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "#########" @echo "#########"
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead." @echo "WARNING: The 'embedding' binary is deprecated. Please use 'jarvis-embedding' instead."
@echo " Remove the 'embedding' binary to remove this warning." @echo " Remove the 'embedding' binary to remove this warning."
@echo "#########" @echo "#########"
endif endif

Package.swift

@ -3,10 +3,10 @@
import PackageDescription import PackageDescription
var sources = [ var sources = [
"src/llama.cpp", "src/jarvis.cpp",
"src/llama-vocab.cpp", "src/jarvis-vocab.cpp",
"src/llama-grammar.cpp", "src/jarvis-grammar.cpp",
"src/llama-sampling.cpp", "src/jarvis-sampling.cpp",
"src/unicode.cpp", "src/unicode.cpp",
"src/unicode-data.cpp", "src/unicode-data.cpp",
"ggml/src/ggml.c", "ggml/src/ggml.c",
@ -45,7 +45,7 @@ cSettings.append(
#endif #endif
let package = Package( let package = Package(
name: "llama", name: "jarvis",
platforms: [ platforms: [
.macOS(.v12), .macOS(.v12),
.iOS(.v14), .iOS(.v14),
@ -53,11 +53,11 @@ let package = Package(
.tvOS(.v14) .tvOS(.v14)
], ],
products: [ products: [
.library(name: "llama", targets: ["llama"]), .library(name: "jarvis", targets: ["jarvis"]),
], ],
targets: [ targets: [
.target( .target(
name: "llama", name: "jarvis",
path: ".", path: ".",
exclude: [ exclude: [
"cmake", "cmake",

README.md

@ -1,30 +1,30 @@
# llama.cpp # jarvis.cpp
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) ![jarvis](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) [![Server](https://github.com/ggerganov/jarvis.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/jarvis.cpp/actions/workflows/server.yml)
[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp) [![Conan Center](https://shields.io/conan/v/jarvis-cpp)](https://conan.io/center/jarvis-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/jarvis.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/jarvis.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
## Recent API changes ## Recent API changes
- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289) - [Changelog for `libjarvis` API](https://github.com/ggerganov/jarvis.cpp/issues/9289)
- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291) - [Changelog for `jarvis-server` REST API](https://github.com/ggerganov/jarvis.cpp/issues/9291)
## Hot topics ## Hot topics
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669** - **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/jarvis.cpp/discussions/9669**
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) - Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/jarvis.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
---- ----
## Description ## Description
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide The main goal of `jarvis.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
variety of hardware - locally and in the cloud. variety of hardware - locally and in the cloud.
- Plain C/C++ implementation without any dependencies - Plain C/C++ implementation without any dependencies
@ -35,7 +35,7 @@ variety of hardware - locally and in the cloud.
- Vulkan and SYCL backend support - Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has Since its [inception](https://github.com/ggerganov/jarvis.cpp/issues/33#issuecomment-1465108022), the project has
improved significantly thanks to many contributions. It is the main playground for developing new features for the improved significantly thanks to many contributions. It is the main playground for developing new features for the
[ggml](https://github.com/ggerganov/ggml) library. [ggml](https://github.com/ggerganov/ggml) library.
@ -52,22 +52,22 @@ Typically finetunes of the base models below are supported as well.
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon) - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423) - [X] [BERT](https://github.com/ggerganov/jarvis.cpp/pull/5423)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) - [X] [Starcoder models](https://github.com/ggerganov/jarvis.cpp/pull/3187)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) - [X] [MPT](https://github.com/ggerganov/jarvis.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) - [X] [Bloom](https://github.com/ggerganov/jarvis.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi) - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
- [X] [StableLM models](https://huggingface.co/stabilityai) - [X] [StableLM models](https://huggingface.co/stabilityai)
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek) - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557) - [x] [PLaMo-13B](https://github.com/ggerganov/jarvis.cpp/pull/3557)
- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi) - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
- [x] [GPT-2](https://huggingface.co/gpt2) - [x] [GPT-2](https://huggingface.co/gpt2)
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118) - [x] [Orion 14B](https://github.com/ggerganov/jarvis.cpp/pull/5118)
- [x] [InternLM2](https://huggingface.co/models?search=internlm2) - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
- [x] [CodeShell](https://github.com/WisdomShell/codeshell) - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
- [x] [Gemma](https://ai.google.dev/gemma) - [x] [Gemma](https://ai.google.dev/gemma)
@ -111,36 +111,36 @@ Typically finetunes of the base models below are supported as well.
**Bindings:** **Bindings:**
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Python: [abetlen/jarvis-cpp-python](https://github.com/abetlen/jarvis-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Go: [go-skynet/go-jarvis.cpp](https://github.com/go-skynet/go-jarvis.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - Node.js: [withcatai/node-jarvis-cpp](https://github.com/withcatai/node-jarvis-cpp)
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) - JS/TS (jarvis.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/jarviscpp)
- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli) - JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) - JavaScript/Wasm (works in browser): [tangledgroup/jarvis-cpp-wasm](https://github.com/tangledgroup/jarvis-cpp-wasm)
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama) - Typescript/Wasm (nicer API, available on npm): [ngxson/wjarvis](https://github.com/ngxson/wjarvis)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Ruby: [yoshoku/jarvis_cpp.rb](https://github.com/yoshoku/jarvis_cpp.rb)
- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs) - Rust (more features): [edgenai/jarvis_cpp-rs](https://github.com/edgenai/jarvis_cpp-rs)
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (nicer API): [mdrokz/rust-jarvis.cpp](https://github.com/mdrokz/rust-jarvis.cpp)
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) - Rust (more direct bindings): [utilityai/jarvis-cpp-rs](https://github.com/utilityai/jarvis-cpp-rs)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - C#/.NET: [SciSharp/JarvisSharp](https://github.com/SciSharp/JarvisSharp)
- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html) - C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) - Clojure: [phronmophobic/jarvis.clj](https://github.com/phronmophobic/jarvis.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) - React Native: [mybigday/jarvis.rn](https://github.com/mybigday/jarvis.rn)
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) - Java: [kherud/java-jarvis.cpp](https://github.com/kherud/java-jarvis.cpp)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) - Zig: [deins/jarvis.cpp.zig](https://github.com/Deins/jarvis.cpp.zig)
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) - Flutter/Dart: [netdur/jarvis_cpp_dart](https://github.com/netdur/jarvis_cpp_dart)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) - PHP (API bindings and features built on top of jarvis.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/jarvis.cpp/pull/6326)
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) - Guile Scheme: [guile_jarvis_cpp](https://savannah.nongnu.org/projects/guile-jarvis-cpp)
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) - Swift [srgtuszy/jarvis-cpp-swift](https://github.com/srgtuszy/jarvis-cpp-swift)
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) - Swift [ShenghaiWang/SwiftJarvis](https://github.com/ShenghaiWang/SwiftJarvis)
**UI:** **UI:**
Unless otherwise noted these projects are open-source with permissive licensing: Unless otherwise noted these projects are open-source with permissive licensing:
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) - [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA) - [iohub/cojarvis](https://github.com/iohub/coLLaMA)
- [janhq/jan](https://github.com/janhq/jan) (AGPL) - [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [nat/openplayground](https://github.com/nat/openplayground) - [nat/openplayground](https://github.com/nat/openplayground)
- [Faraday](https://faraday.dev/) (proprietary) - [Faraday](https://faraday.dev/) (proprietary)
@ -149,9 +149,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [ramalama](https://github.com/containers/ramalama) (MIT) - [ramalama](https://github.com/containers/ramalama) (MIT)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT) - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) - [Mozilla-Ocho/jarvisfile](https://github.com/Mozilla-Ocho/jarvisfile)
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) - [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
- [ollama/ollama](https://github.com/ollama/ollama) - [ojarvis/ojarvis](https://github.com/ojarvis/ojarvis)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) - [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
@ -173,24 +173,24 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [AIKit](https://github.com/sozercan/aikit) (MIT) - [AIKit](https://github.com/sozercan/aikit) (MIT)
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL) - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) - [Jarvis Assistant](https://github.com/vietanhdev/jarvis-assistant) (GPL)
- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT) - [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* *(to have a project listed here, it should clearly state that it depends on `jarvis.cpp`)*
**Tools:** **Tools:**
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML - [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
- [akx/ollama-dl](https://github.com/akx/ollama-dl) download models from the Ollama library to be used directly with llama.cpp - [akx/ojarvis-dl](https://github.com/akx/ojarvis-dl) download models from the Ojarvis library to be used directly with jarvis.cpp
- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - [crashr/gppm](https://github.com/crashr/gppm) launch jarvis.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example) - [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-jarvis-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
**Infrastructure:** **Infrastructure:**
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for jarvis.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly - [jarvis_cpp_canister](https://github.com/onicai/jarvis_cpp_canister) - jarvis.cpp as a smart contract on the Internet Computer, using WebAssembly
**Games:** **Games:**
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
@ -201,8 +201,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary> <summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
``` ```
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e $ make -j && ./jarvis-cli -m models/jarvis-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info: I jarvis.cpp build info:
I UNAME_S: Darwin I UNAME_S: Darwin
I UNAME_P: arm I UNAME_P: arm
I UNAME_M: arm64 I UNAME_M: arm64
@ -215,12 +215,12 @@ I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
make: Nothing to be done for `default'. make: Nothing to be done for `default'.
main: build = 1041 (cf658ad) main: build = 1041 (cf658ad)
main: seed = 1692823051 main: seed = 1692823051
llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest)) jarvis_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/jarvis-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
llama_model_loader: - type f32: 81 tensors jarvis_model_loader: - type f32: 81 tensors
llama_model_loader: - type q4_0: 281 tensors jarvis_model_loader: - type q4_0: 281 tensors
llama_model_loader: - type q6_K: 1 tensors jarvis_model_loader: - type q6_K: 1 tensors
llm_load_print_meta: format = GGUF V1 (latest) llm_load_print_meta: format = GGUF V1 (latest)
llm_load_print_meta: arch = llama llm_load_print_meta: arch = jarvis
llm_load_print_meta: vocab type = SPM llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000 llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0 llm_load_print_meta: n_merges = 0
@ -248,8 +248,8 @@ llm_load_print_meta: LF token = 13 '<0x0A>'
llm_load_tensors: ggml ctx size = 0.11 MB llm_load_tensors: ggml ctx size = 0.11 MB
llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state) llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
................................................................................................... ...................................................................................................
llama_new_context_with_model: kv self size = 400.00 MB jarvis_new_context_with_model: kv self size = 400.00 MB
llama_new_context_with_model: compute buffer total size = 75.41 MB jarvis_new_context_with_model: compute buffer total size = 75.41 MB
system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000 sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
@ -271,11 +271,11 @@ How does a Website Work?
A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit whether its an image or text file (like PDFs). In order for someone elses browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable! A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit whether its an image or text file (like PDFs). In order for someone elses browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking. The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
How to How to
llama_print_timings: load time = 576.45 ms jarvis_print_timings: load time = 576.45 ms
llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second) jarvis_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second) jarvis_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second) jarvis_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
llama_print_timings: total time = 25431.49 ms jarvis_print_timings: total time = 25431.49 ms
``` ```
</details> </details>
@ -297,14 +297,14 @@ Here are the end-to-end binary build and model conversion steps for most support
Firstly, you need to get the binary. There are different methods that you can follow: Firstly, you need to get the binary. There are different methods that you can follow:
- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md) - Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md) - Method 2: If you are using MacOS or Linux, you can install jarvis.cpp via [brew, flox or nix](./docs/install.md)
- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md) - Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases) - Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/jarvis.cpp/releases)
You can run a basic completion using this command: You can run a basic completion using this command:
```bash ```bash
llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128 jarvis-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
# Output: # Output:
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga: it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey. # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga: it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
@ -317,7 +317,7 @@ See [this page](./examples/main/README.md) for a full list of parameters.
If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter: If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
```bash ```bash
llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
# Output: # Output:
# > hi, who are you? # > hi, who are you?
@ -327,26 +327,26 @@ llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
# Easy peasy! The answer to 1+1 is... 2! # Easy peasy! The answer to 1+1 is... 2!
``` ```
By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/jarvis.cpp/wiki/Templates-supported-by-jarvis_chat_apply_template)
```bash ```bash
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml ./jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
``` ```
You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters: You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
```bash ```bash
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:' ./jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
``` ```
### Web server ### Web server
[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. [jarvis.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
Example usage: Example usage:
```bash ```bash
./llama-server -m your_model.gguf --port 8080 ./jarvis-server -m your_model.gguf --port 8080
# Basic web UI can be accessed via browser: http://localhost:8080 # Basic web UI can be accessed via browser: http://localhost:8080
# Chat completion endpoint: http://localhost:8080/v1/chat/completions # Chat completion endpoint: http://localhost:8080/v1/chat/completions
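# The endpoints above follow the OpenAI API, so a chat completion can presumably be
# requested with curl as sketched here (the payload fields are assumptions based on that API):
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"messages":[{"role":"user","content":"Hello!"}]}'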
@ -369,16 +369,16 @@ Here is an example of a few-shot interaction, invoked with the command
./examples/chat-13B.sh ./examples/chat-13B.sh
# custom arguments using a 13B model # custom arguments using a 13B model
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt ./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
``` ```
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program. Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `jarvis-cli` example program.
![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png) ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
### Persistent Interaction ### Persistent Interaction
The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. The prompt, user inputs, and model generations can be saved and resumed across calls to `./jarvis-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
```bash ```bash
# Start a new chat # Start a new chat
@ -397,10 +397,10 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
### Constrained output with grammars ### Constrained output with grammars
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only: `jarvis.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
```bash ```bash
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' ./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
``` ```
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
@ -409,7 +409,7 @@ For authoring more complex JSON grammars, you can also check out https://grammar
## Build ## Build
Please refer to [Build llama.cpp locally](./docs/build.md) Please refer to [Build jarvis.cpp locally](./docs/build.md)
## Supported backends ## Supported backends
@ -430,11 +430,11 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
### Prepare and Quantize ### Prepare and Quantize
> [!NOTE] > [!NOTE]
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours. > You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `jarvis.cpp` main every 6 hours.
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-jarvis-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives. Note: `convert.py` has been moved to `examples/convert_legacy_jarvis.py` and shouldn't be used for anything other than `Jarvis/Jarvis2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face. It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
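For illustration only, a minimal convert-then-quantize flow might look like the sketch below; the model directory and output file names are placeholders, and the quantization call mirrors the `jarvis-quantize` invocations that appear in `ci/run.sh` later in this diff.

```bash
# Sketch: convert a Hugging Face model directory (placeholder path) to an f16 GGUF
python3 convert_hf_to_gguf.py ./models/my-model --outfile ./models/my-model-f16.gguf

# Quantize the f16 GGUF to Q4_0
./jarvis-quantize ./models/my-model-f16.gguf ./models/my-model-q4_0.gguf q4_0
```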
To learn more about quantizing models, [read this documentation](./examples/quantize/README.md) To learn more about quantizing models, [read this documentation](./examples/quantize/README.md)
@ -444,17 +444,17 @@ To learn more about quantizing models, [read this documentation](./examples/quant @ -444,17 +444,17 @@ To learn more about quantizing models, [read this documentation](./examples/quant
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better). You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
To learn more about how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md) To learn more about how to measure perplexity using jarvis.cpp, [read this documentation](./examples/perplexity/README.md)
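As a hedged sketch, a perplexity run over a raw text file might look like this; the model and text paths are placeholders, and the flags follow the `jarvis-perplexity` invocations in `ci/run.sh` further below.

```bash
# Lower perplexity is better; wiki.test.raw is a placeholder text file
./jarvis-perplexity --model ./models/7B/ggml-model-q4_0.gguf -f wiki.test.raw -c 2048 -b 512 --chunks 4
```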
## Contributing ## Contributing
- Contributors can open PRs - Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch - Collaborators can push to branches in the `jarvis.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions - Collaborators will be invited based on contributions
- Any help with managing issues, PRs and projects is very appreciated! - Any help with managing issues, PRs and projects is very appreciated!
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions - See [good first issues](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/jarvis.cpp/discussions/205)
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532) - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
## Other documentations ## Other documentations
@ -470,13 +470,13 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
- [Running on Docker](./docs/docker.md) - [Running on Docker](./docs/docker.md)
- [Build on Android](./docs/android.md) - [Build on Android](./docs/android.md)
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md) - [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) - [GGML tips & tricks](https://github.com/ggerganov/jarvis.cpp/wiki/GGML-Tips-&-Tricks)
**Seminal papers and background on the models** **Seminal papers and background on the models**
If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
- LLaMA: - LLaMA:
- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) - [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-jarvis-meta-ai/)
- [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) - [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
- GPT-3 - GPT-3
- [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
@ -1,6 +1,6 @@
# Security Policy # Security Policy
- [**Using llama.cpp securely**](#using-llamacpp-securely) - [**Using jarvis.cpp securely**](#using-jarviscpp-securely)
- [Untrusted models](#untrusted-models) - [Untrusted models](#untrusted-models)
- [Untrusted inputs](#untrusted-inputs) - [Untrusted inputs](#untrusted-inputs)
- [Data privacy](#data-privacy) - [Data privacy](#data-privacy)
@ -8,7 +8,7 @@
- [Multi-Tenant environments](#multi-tenant-environments) - [Multi-Tenant environments](#multi-tenant-environments)
- [**Reporting a vulnerability**](#reporting-a-vulnerability) - [**Reporting a vulnerability**](#reporting-a-vulnerability)
## Using llama.cpp securely ## Using jarvis.cpp securely
### Untrusted models ### Untrusted models
Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources. Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.
@ -57,11 +57,11 @@ If you intend to run multiple models in parallel with shared memory, it is your
## Reporting a vulnerability ## Reporting a vulnerability
Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++. Beware that none of the topics under [Using jarvis.cpp securely](#using-jarviscpp-securely) are considered vulnerabilities of LLaMA C++.
<!-- normal version --> <!-- normal version -->
However, if you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released. However, if you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new). Please disclose it as a private [security advisory](https://github.com/ggerganov/jarvis.cpp/security/advisories/new).
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure. A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
@ -1,11 +1,11 @@
# CI # CI
In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework: In addition to [Github Actions](https://github.com/ggerganov/jarvis.cpp/actions) `jarvis.cpp` uses a custom CI framework:
https://github.com/ggml-org/ci https://github.com/ggml-org/ci
It monitors the `master` branch for new commits and runs the It monitors the `master` branch for new commits and runs the
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us [ci/run.sh](https://github.com/ggerganov/jarvis.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
to cover various hardware architectures, including GPU and Apple Silicon instances. to cover various hardware architectures, including GPU and Apple Silicon instances.
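The exact invocation is defined by `ci/run.sh` itself; as an assumption-laden sketch, a local run typically takes an output directory and a mount directory as arguments, with optional `GG_BUILD_*` variables such as the `GG_BUILD_METAL` check visible in the script below:

```bash
mkdir -p tmp
# CPU-only run (placeholder output/mount paths)
bash ./ci/run.sh ./tmp/results ./tmp/mnt
# Same run with the Metal build flag that ci/run.sh checks for
GG_BUILD_METAL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```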
ci/run.sh
@ -36,7 +36,7 @@ sd=`dirname $0`
cd $sd/../ cd $sd/../
SRC=`pwd` SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON" CMAKE_EXTRA="-DJARVIS_FATAL_WARNINGS=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON" CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@ -217,7 +217,7 @@ function gg_sum_test_scripts_release {
function gg_get_model { function gg_get_model {
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf" local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf" local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf" local gguf_2="$MNT/models/open-jarvis/7B-v2/ggml-model-f16.gguf"
if [[ -s $gguf_0 ]]; then if [[ -s $gguf_0 ]]; then
echo -n "$gguf_0" echo -n "$gguf_0"
elif [[ -s $gguf_1 ]]; then elif [[ -s $gguf_1 ]]; then
@ -236,7 +236,7 @@ function gg_run_ctest_with_model_debug {
local model; model=$(gg_get_model) local model; model=$(gg_get_model)
cd build-ci-debug cd build-ci-debug
set -e set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log (JARVISCPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e set +e
cd .. cd ..
} }
@ -247,7 +247,7 @@ function gg_run_ctest_with_model_release {
local model; model=$(gg_get_model) local model; model=$(gg_get_model)
cd build-ci-release cd build-ci-release
set -e set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log (JARVISCPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e set +e
cd .. cd ..
} }
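Outside of CI, the same model-gated tests can presumably be run by hand using the environment variable shown above; the GGUF path is a placeholder:

```bash
# From inside the build directory, run only the ctest cases labeled "model" against a local GGUF file
JARVISCPP_TEST_MODELFILE=./models/pythia-1.4b-f16.gguf ctest --output-on-failure -L model
```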
@ -272,24 +272,24 @@ function gg_sum_ctest_with_model_release {
gg_printf '```\n' gg_printf '```\n'
} }
# open_llama_7b_v2 # open_jarvis_7b_v2
function gg_run_open_llama_7b_v2 { function gg_run_open_jarvis_7b_v2 {
cd ${SRC} cd ${SRC}
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/pytorch_model.bin.index.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/open-llama/7B-v2" path_models="../models-mnt/open-jarvis/7B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw" path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
@ -299,7 +299,7 @@ function gg_run_open_llama_7b_v2 {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf python3 ../examples/convert_legacy_jarvis.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -315,47 +315,47 @@ function gg_run_open_llama_7b_v2 {
wiki_test="${path_wiki}/wiki.test.raw" wiki_test="${path_wiki}/wiki.test.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 ./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 ./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 ./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 ./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 ./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k ./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k ./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k ./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k ./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/jarvis-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/jarvis-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/jarvis-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/jarvis-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/jarvis-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/jarvis-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/jarvis-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/jarvis-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/jarvis-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/jarvis-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/jarvis-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log (time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/jarvis-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/jarvis-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/jarvis-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/jarvis-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl { function check_ppl {
qnt="$1" qnt="$1"
@ -387,7 +387,7 @@ function gg_run_open_llama_7b_v2 {
set +e set +e
} }
function gg_sum_open_llama_7b_v2 { function gg_sum_open_jarvis_7b_v2 {
gg_printf '### %s\n\n' "${ci}" gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 7B-v2:\n' gg_printf 'OpenLLaMA 7B-v2:\n'
@ -449,45 +449,45 @@ function gg_run_pythia_1_4b {
wiki_test_60="${path_wiki}/wiki.test-60.raw" wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 ./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 ./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 ./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 ./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 ./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k ./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k ./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k ./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k ./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/jarvis-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/jarvis-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/jarvis-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/jarvis-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/jarvis-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/jarvis-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/jarvis-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/jarvis-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/jarvis-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/jarvis-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/jarvis-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log (time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/jarvis-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/jarvis-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl { function check_ppl {
qnt="$1" qnt="$1"
@ -580,47 +580,47 @@ function gg_run_pythia_2_8b {
wiki_test="${path_wiki}/wiki.test.raw" wiki_test="${path_wiki}/wiki.test.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 ./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0 ./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1 ./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0 ./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1 ./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k ./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k ./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k ./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k ./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k ./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/jarvis-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/jarvis-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/jarvis-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/jarvis-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/jarvis-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/jarvis-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/jarvis-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/jarvis-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/jarvis-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/jarvis-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/jarvis-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log (time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log (time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/jarvis-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/jarvis-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/jarvis-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/jarvis-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl { function check_ppl {
qnt="$1" qnt="$1"
@ -704,10 +704,10 @@ function gg_run_embd_bge_small {
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0 ./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/jarvis-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/jarvis-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
set +e set +e
} }
@ -752,7 +752,7 @@ function gg_run_rerank_tiny {
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
# for this model, the SEP token is "</s>" # for this model, the SEP token is "</s>"
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log (time ./bin/jarvis-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
# sample output # sample output
# rerank score 0: 0.029 # rerank score 0: 0.029
@ -804,11 +804,11 @@ function gg_check_build_requirements {
## main ## main
export LLAMA_LOG_PREFIX=1 export JARVIS_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1 export JARVIS_LOG_TIMESTAMPS=1
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt # Create symlink: ./jarvis.cpp/models-mnt -> $MNT/models/models-mnt
rm -rf ${SRC}/models-mnt rm -rf ${SRC}/models-mnt
mnt_models=${MNT}/models mnt_models=${MNT}/models
mkdir -p ${mnt_models} mkdir -p ${mnt_models}
@ -841,7 +841,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run pythia_1_4b test $ret -eq 0 && gg_run pythia_1_4b
else else
test $ret -eq 0 && gg_run pythia_2_8b test $ret -eq 0 && gg_run pythia_2_8b
#test $ret -eq 0 && gg_run open_llama_7b_v2 #test $ret -eq 0 && gg_run open_jarvis_7b_v2
fi fi
test $ret -eq 0 && gg_run ctest_with_model_debug test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release test $ret -eq 0 && gg_run ctest_with_model_release
@ -1,7 +1,7 @@
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@) set(JARVIS_VERSION @JARVIS_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) set(JARVIS_BUILD_COMMIT @JARVIS_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(JARVIS_BUILD_NUMBER @JARVIS_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) set(JARVIS_SHARED_LIB @BUILD_SHARED_LIBS@)
set(GGML_BLAS @GGML_BLAS@) set(GGML_BLAS @GGML_BLAS@)
set(GGML_CUDA @GGML_CUDA@) set(GGML_CUDA @GGML_CUDA@)
@ -18,9 +18,9 @@ set(GGML_OPENMP @GGML_OPENMP@)
@PACKAGE_INIT@ @PACKAGE_INIT@
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@") set_and_check(JARVIS_INCLUDE_DIR "@PACKAGE_JARVIS_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") set_and_check(JARVIS_LIB_DIR "@PACKAGE_JARVIS_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") set_and_check(JARVIS_BIN_DIR "@PACKAGE_JARVIS_BIN_INSTALL_DIR@")
# Ensure transient dependencies satisfied # Ensure transient dependencies satisfied
@ -66,25 +66,25 @@ endif()
find_library(ggml_LIBRARY ggml find_library(ggml_LIBRARY ggml
REQUIRED REQUIRED
HINTS ${LLAMA_LIB_DIR}) HINTS ${JARVIS_LIB_DIR})
find_library(llama_LIBRARY llama find_library(jarvis_LIBRARY jarvis
REQUIRED REQUIRED
HINTS ${LLAMA_LIB_DIR}) HINTS ${JARVIS_LIB_DIR})
set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@") set(_jarvis_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@") set(_jarvis_transient_defines "@GGML_TRANSIENT_DEFINES@")
add_library(llama UNKNOWN IMPORTED) add_library(jarvis UNKNOWN IMPORTED)
set_target_properties(llama set_target_properties(jarvis
PROPERTIES PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" INTERFACE_INCLUDE_DIRECTORIES "${JARVIS_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}" INTERFACE_LINK_LIBRARIES "${_jarvis_link_deps}"
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}" INTERFACE_COMPILE_DEFINITIONS "${_jarvis_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}" IMPORTED_LOCATION "${jarvis_LIBRARY}"
INTERFACE_COMPILE_FEATURES cxx_std_11 INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON ) POSITION_INDEPENDENT_CODE ON )
check_required_components(Llama) check_required_components(Jarvis)
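As a hedged sketch of how a downstream project might consume this package config (the install prefix and project layout are assumptions): the consumer's own CMakeLists.txt would call `find_package(Jarvis REQUIRED)` and link the imported `jarvis` target defined above, so configuration only needs the install prefix on the search path:

```bash
# Point CMake at the jarvis.cpp install prefix (placeholder path) and build the consumer project
cmake -B build -DCMAKE_PREFIX_PATH=/opt/jarvis
cmake --build build
```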
@ -3,8 +3,8 @@ exec_prefix=${prefix}
libdir=${exec_prefix}/lib libdir=${exec_prefix}/lib
includedir=${prefix}/include includedir=${prefix}/include
Name: llama Name: jarvis
Description: Port of Facebook's LLaMA model in C/C++ Description: Port of Facebook's LLaMA model in C/C++
Version: @PROJECT_VERSION@ Version: @PROJECT_VERSION@
Libs: -L${libdir} -lllama Libs: -L${libdir} -ljarvis
Cflags: -I${includedir} Cflags: -I${includedir}


@ -74,17 +74,17 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif() endif()
set(LLAMA_COMMON_EXTRA_LIBS build_info) set(JARVIS_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url # Use curl to download model url
if (LLAMA_CURL) if (JARVIS_CURL)
find_package(CURL REQUIRED) find_package(CURL REQUIRED)
add_definitions(-DLLAMA_USE_CURL) add_definitions(-DJARVIS_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS}) include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED) find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) set(JARVIS_COMMON_EXTRA_LIBS ${JARVIS_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
endif () endif ()
target_include_directories(${TARGET} PUBLIC .) target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_11) target_compile_features (${TARGET} PUBLIC cxx_std_11)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) target_link_libraries (${TARGET} PRIVATE ${JARVIS_COMMON_EXTRA_LIBS} PUBLIC jarvis Threads::Threads)

File diff suppressed because it is too large


@ -11,7 +11,7 @@
// //
struct common_arg { struct common_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON}; std::set<enum jarvis_example> examples = {JARVIS_EXAMPLE_COMMON};
std::vector<const char *> args; std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value const char * value_hint_2 = nullptr; // for second arg value
@ -52,17 +52,17 @@ struct common_arg {
void (*handler)(common_params & params, const std::string &, const std::string &) void (*handler)(common_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
common_arg & set_examples(std::initializer_list<enum llama_example> examples); common_arg & set_examples(std::initializer_list<enum jarvis_example> examples);
common_arg & set_env(const char * env); common_arg & set_env(const char * env);
common_arg & set_sparam(); common_arg & set_sparam();
bool in_example(enum llama_example ex); bool in_example(enum jarvis_example ex);
bool get_value_from_env(std::string & output); bool get_value_from_env(std::string & output);
bool has_value_from_env(); bool has_value_from_env();
std::string to_string(); std::string to_string();
}; };
struct common_params_context { struct common_params_context {
enum llama_example ex = LLAMA_EXAMPLE_COMMON; enum jarvis_example ex = JARVIS_EXAMPLE_COMMON;
common_params & params; common_params & params;
std::vector<common_arg> options; std::vector<common_arg> options;
void(*print_usage)(int, char **) = nullptr; void(*print_usage)(int, char **) = nullptr;
@ -71,7 +71,7 @@ struct common_params_context {
// parse input arguments from CLI // parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); bool common_params_parse(int argc, char ** argv, common_params & params, jarvis_example ex, void(*print_usage)(int, char **) = nullptr);
// function to be used by test-arg-parser // function to be used by test-arg-parser
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); common_params_context common_params_parser_init(common_params & params, jarvis_example ex, void(*print_usage)(int, char **) = nullptr);
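For reference, a minimal usage sketch of the renamed argument-parser entry points above, assuming the declarations live in common/arg.h as upstream and that JARVIS_EXAMPLE_COMMON plays the role of the former LLAMA_EXAMPLE_COMMON:

    #include "arg.h"      // common_params_parse / jarvis_example (assumed header name)
    #include "common.h"   // common_params

    int main(int argc, char ** argv) {
        common_params params;
        // On an invalid value this prints the usage for that specific argument and returns false.
        if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_COMMON)) {
            return 1;
        }
        // params.model, params.sparams, etc. are now populated from the command line.
        return 0;
    }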


@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@; int JARVIS_BUILD_NUMBER = @BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@BUILD_COMMIT@"; char const *JARVIS_COMMIT = "@BUILD_COMMIT@";
char const *LLAMA_COMPILER = "@BUILD_COMPILER@"; char const *JARVIS_COMPILER = "@BUILD_COMPILER@";
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@"; char const *JARVIS_BUILD_TARGET = "@BUILD_TARGET@";


@ -8,7 +8,7 @@
#define JSON_ASSERT GGML_ASSERT #define JSON_ASSERT GGML_ASSERT
#include "json.hpp" #include "json.hpp"
#include "json-schema-to-grammar.h" #include "json-schema-to-grammar.h"
#include "llama.h" #include "jarvis.h"
#include <algorithm> #include <algorithm>
#include <cinttypes> #include <cinttypes>
@ -48,7 +48,7 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <unistd.h> #include <unistd.h>
#endif #endif
#if defined(LLAMA_USE_CURL) #if defined(JARVIS_USE_CURL)
#include <curl/curl.h> #include <curl/curl.h>
#include <curl/easy.h> #include <curl/easy.h>
#include <future> #include <future>
@ -58,7 +58,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
#if defined(LLAMA_USE_CURL) #if defined(JARVIS_USE_CURL)
#ifdef __linux__ #ifdef __linux__
#include <linux/limits.h> #include <linux/limits.h>
#elif defined(_WIN32) #elif defined(_WIN32)
@ -66,8 +66,8 @@
#else #else
#include <sys/syslimits.h> #include <sys/syslimits.h>
#endif #endif
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 #define JARVIS_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
#endif // LLAMA_USE_CURL #endif // JARVIS_USE_CURL
using json = nlohmann::ordered_json; using json = nlohmann::ordered_json;
@ -364,8 +364,8 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
} }
void common_init() { void common_init() {
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { jarvis_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) { if (LOG_DEFAULT_JARVIS <= common_log_verbosity_thold) {
common_log_add(common_log_main(), level, "%s", text); common_log_add(common_log_main(), level, "%s", text);
} }
}, NULL); }, NULL);
@ -376,7 +376,7 @@ void common_init() {
const char * build_type = " (debug)"; const char * build_type = " (debug)";
#endif #endif
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); LOG_INF("build: %d (%s) with %s for %s%s\n", JARVIS_BUILD_NUMBER, JARVIS_COMMIT, JARVIS_COMPILER, JARVIS_BUILD_TARGET, build_type);
} }
std::string common_params_get_system_info(const common_params & params) { std::string common_params_get_system_info(const common_params & params) {
@ -389,9 +389,9 @@ std::string common_params_get_system_info(const common_params & params) {
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
// TODO: windows + arm64 + mingw64 // TODO: windows + arm64 + mingw64
DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
os << " / " << logicalProcessorCount << " | " << llama_print_system_info(); os << " / " << logicalProcessorCount << " | " << jarvis_print_system_info();
#else #else
os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info(); os << " / " << std::thread::hardware_concurrency() << " | " << jarvis_print_system_info();
#endif #endif
return os.str(); return os.str();
@ -483,7 +483,7 @@ std::string string_from(const std::vector<int> & values) {
return buf.str(); return buf.str();
} }
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) { std::string string_from(const struct jarvis_context * ctx, const std::vector<jarvis_token> & tokens) {
std::stringstream buf; std::stringstream buf;
buf << "[ "; buf << "[ ";
@ -514,7 +514,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
return buf.str(); return buf.str();
} }
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) { std::string string_from(const struct jarvis_context * ctx, const struct jarvis_batch & batch) {
std::stringstream buf; std::stringstream buf;
buf << "[ "; buf << "[ ";
@ -586,27 +586,27 @@ void string_process_escapes(std::string & input) {
input.resize(output_idx); input.resize(output_idx);
} }
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) { bool string_parse_kv_override(const char * data, std::vector<jarvis_model_kv_override> & overrides) {
const char * sep = strchr(data, '='); const char * sep = strchr(data, '=');
if (sep == nullptr || sep - data >= 128) { if (sep == nullptr || sep - data >= 128) {
LOG_ERR("%s: malformed KV override '%s'\n", __func__, data); LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
return false; return false;
} }
llama_model_kv_override kvo; jarvis_model_kv_override kvo;
std::strncpy(kvo.key, data, sep - data); std::strncpy(kvo.key, data, sep - data);
kvo.key[sep - data] = 0; kvo.key[sep - data] = 0;
sep++; sep++;
if (strncmp(sep, "int:", 4) == 0) { if (strncmp(sep, "int:", 4) == 0) {
sep += 4; sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; kvo.tag = JARVIS_KV_OVERRIDE_TYPE_INT;
kvo.val_i64 = std::atol(sep); kvo.val_i64 = std::atol(sep);
} else if (strncmp(sep, "float:", 6) == 0) { } else if (strncmp(sep, "float:", 6) == 0) {
sep += 6; sep += 6;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; kvo.tag = JARVIS_KV_OVERRIDE_TYPE_FLOAT;
kvo.val_f64 = std::atof(sep); kvo.val_f64 = std::atof(sep);
} else if (strncmp(sep, "bool:", 5) == 0) { } else if (strncmp(sep, "bool:", 5) == 0) {
sep += 5; sep += 5;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; kvo.tag = JARVIS_KV_OVERRIDE_TYPE_BOOL;
if (std::strcmp(sep, "true") == 0) { if (std::strcmp(sep, "true") == 0) {
kvo.val_bool = true; kvo.val_bool = true;
} else if (std::strcmp(sep, "false") == 0) { } else if (std::strcmp(sep, "false") == 0) {
@ -617,7 +617,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
} }
} else if (strncmp(sep, "str:", 4) == 0) { } else if (strncmp(sep, "str:", 4) == 0) {
sep += 4; sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; kvo.tag = JARVIS_KV_OVERRIDE_TYPE_STR;
if (strlen(sep) > 127) { if (strlen(sep) > 127) {
LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
return false; return false;
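A short illustration of the override strings accepted by the parser above; the key names are hypothetical:

    std::vector<jarvis_model_kv_override> overrides;
    // Accepted forms: "<key>=int:<v>", "<key>=float:<v>", "<key>=bool:true|false", "<key>=str:<v>"
    // (keys are limited to 127 characters, as are str values).
    bool ok = string_parse_kv_override("some.metadata.count=int:42",    overrides)
           && string_parse_kv_override("some.metadata.flag=bool:false", overrides);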
@ -788,8 +788,8 @@ std::string fs_get_cache_directory() {
} }
return p; return p;
}; };
if (getenv("LLAMA_CACHE")) { if (getenv("JARVIS_CACHE")) {
cache_directory = std::getenv("LLAMA_CACHE"); cache_directory = std::getenv("JARVIS_CACHE");
} else { } else {
#ifdef __linux__ #ifdef __linux__
if (std::getenv("XDG_CACHE_HOME")) { if (std::getenv("XDG_CACHE_HOME")) {
@ -803,7 +803,7 @@ std::string fs_get_cache_directory() {
cache_directory = std::getenv("LOCALAPPDATA"); cache_directory = std::getenv("LOCALAPPDATA");
#endif // __linux__ #endif // __linux__
cache_directory = ensure_trailing_slash(cache_directory); cache_directory = ensure_trailing_slash(cache_directory);
cache_directory += "llama.cpp"; cache_directory += "jarvis.cpp";
} }
return ensure_trailing_slash(cache_directory); return ensure_trailing_slash(cache_directory);
} }
@ -824,16 +824,16 @@ std::string fs_get_cache_file(const std::string & filename) {
// //
struct common_init_result common_init_from_params(common_params & params) { struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams; common_init_result iparams;
auto mparams = common_model_params_to_llama(params); auto mparams = common_model_params_to_jarvis(params);
llama_model * model = nullptr; jarvis_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) { if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else if (!params.model_url.empty()) { } else if (!params.model_url.empty()) {
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else { } else {
model = llama_load_model_from_file(params.model.c_str(), mparams); model = jarvis_load_model_from_file(params.model.c_str(), mparams);
} }
if (model == NULL) { if (model == NULL) {
@ -844,58 +844,58 @@ struct common_init_result common_init_from_params(common_params & params) {
if (params.reranking) { if (params.reranking) {
bool ok = true; bool ok = true;
if (llama_token_bos(model) == LLAMA_TOKEN_NULL) { if (jarvis_token_bos(model) == JARVIS_TOKEN_NULL) {
LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__); LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
ok = false; ok = false;
} }
if (llama_token_eos(model) == LLAMA_TOKEN_NULL) { if (jarvis_token_eos(model) == JARVIS_TOKEN_NULL) {
LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__); LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
ok = false; ok = false;
} }
if (llama_token_sep(model) == LLAMA_TOKEN_NULL) { if (jarvis_token_sep(model) == JARVIS_TOKEN_NULL) {
LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__); LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
ok = false; ok = false;
} }
if (!ok) { if (!ok) {
llama_free_model(model); jarvis_free_model(model);
return iparams; return iparams;
} }
} }
auto cparams = common_context_params_to_llama(params); auto cparams = common_context_params_to_jarvis(params);
llama_context * lctx = llama_new_context_with_model(model, cparams); jarvis_context * lctx = jarvis_new_context_with_model(model, cparams);
if (lctx == NULL) { if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str()); LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model); jarvis_free_model(model);
return iparams; return iparams;
} }
if (!params.control_vectors.empty()) { if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = jarvis_n_layer(model);
const auto cvec = common_control_vector_load(params.control_vectors); const auto cvec = common_control_vector_load(params.control_vectors);
if (cvec.n_embd == -1) { if (cvec.n_embd == -1) {
llama_free(lctx); jarvis_free(lctx);
llama_free_model(model); jarvis_free_model(model);
return iparams; return iparams;
} }
int err = llama_control_vector_apply(lctx, int err = jarvis_control_vector_apply(lctx,
cvec.data.data(), cvec.data.data(),
cvec.data.size(), cvec.data.size(),
cvec.n_embd, cvec.n_embd,
params.control_vector_layer_start, params.control_vector_layer_start,
params.control_vector_layer_end); params.control_vector_layer_end);
if (err) { if (err) {
llama_free(lctx); jarvis_free(lctx);
llama_free_model(model); jarvis_free_model(model);
return iparams; return iparams;
} }
@ -906,11 +906,11 @@ struct common_init_result common_init_from_params(common_params & params) {
common_lora_adapter_container loaded_la; common_lora_adapter_container loaded_la;
loaded_la.path = la.path; loaded_la.path = la.path;
loaded_la.scale = la.scale; loaded_la.scale = la.scale;
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); loaded_la.adapter = jarvis_lora_adapter_init(model, la.path.c_str());
if (loaded_la.adapter == nullptr) { if (loaded_la.adapter == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx); jarvis_free(lctx);
llama_free_model(model); jarvis_free_model(model);
return iparams; return iparams;
} }
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
@ -919,7 +919,7 @@ struct common_init_result common_init_from_params(common_params & params) {
common_lora_adapters_apply(lctx, iparams.lora_adapters); common_lora_adapters_apply(lctx, iparams.lora_adapters);
} }
if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) { if (params.sparams.ignore_eos && jarvis_token_eos(model) == JARVIS_TOKEN_NULL) {
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sparams.ignore_eos = false; params.sparams.ignore_eos = false;
} }
@ -927,35 +927,35 @@ struct common_init_result common_init_from_params(common_params & params) {
if (params.warmup) { if (params.warmup) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
std::vector<llama_token> tmp; std::vector<jarvis_token> tmp;
llama_token bos = llama_token_bos(model); jarvis_token bos = jarvis_token_bos(model);
llama_token eos = llama_token_eos(model); jarvis_token eos = jarvis_token_eos(model);
// some models (e.g. T5) don't have a BOS token // some models (e.g. T5) don't have a BOS token
if (bos != LLAMA_TOKEN_NULL) { if (bos != JARVIS_TOKEN_NULL) {
tmp.push_back(bos); tmp.push_back(bos);
} }
if (eos != LLAMA_TOKEN_NULL) { if (eos != JARVIS_TOKEN_NULL) {
tmp.push_back(eos); tmp.push_back(eos);
} }
if (tmp.empty()) { if (tmp.empty()) {
tmp.push_back(0); tmp.push_back(0);
} }
if (llama_model_has_encoder(model)) { if (jarvis_model_has_encoder(model)) {
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size())); jarvis_encode(lctx, jarvis_batch_get_one(tmp.data(), tmp.size()));
llama_token decoder_start_token_id = llama_model_decoder_start_token(model); jarvis_token decoder_start_token_id = jarvis_model_decoder_start_token(model);
if (decoder_start_token_id == -1) { if (decoder_start_token_id == -1) {
decoder_start_token_id = bos; decoder_start_token_id = bos;
} }
tmp.clear(); tmp.clear();
tmp.push_back(decoder_start_token_id); tmp.push_back(decoder_start_token_id);
} }
if (llama_model_has_decoder(model)) { if (jarvis_model_has_decoder(model)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch))); jarvis_decode(lctx, jarvis_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
} }
llama_kv_cache_clear(lctx); jarvis_kv_cache_clear(lctx);
llama_synchronize(lctx); jarvis_synchronize(lctx);
llama_perf_context_reset(lctx); jarvis_perf_context_reset(lctx);
} }
iparams.model = model; iparams.model = model;
@ -964,17 +964,17 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams; return iparams;
} }
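A sketch of the typical caller pattern for common_init_from_params after the rename; the wrapper function is hypothetical and the model path is assumed to be set by the caller:

    #include "common.h"

    static bool load_and_release(common_params & params) {   // hypothetical helper
        common_init_result init = common_init_from_params(params);
        jarvis_model   * model = init.model;
        jarvis_context * lctx  = init.context;
        if (model == nullptr || lctx == nullptr) {
            return false;   // common_init_from_params already logged the failure
        }
        // ... evaluate tokens with lctx here ...
        jarvis_free(lctx);
        jarvis_free_model(model);
        return true;
    }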
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) { void common_lora_adapters_apply(struct jarvis_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
llama_lora_adapter_clear(ctx); jarvis_lora_adapter_clear(ctx);
for (auto & la : lora_adapters) { for (auto & la : lora_adapters) {
if (la.scale != 0.0f) { if (la.scale != 0.0f) {
llama_lora_adapter_set(ctx, la.adapter, la.scale); jarvis_lora_adapter_set(ctx, la.adapter, la.scale);
} }
} }
} }
struct llama_model_params common_model_params_to_llama(const common_params & params) { struct jarvis_model_params common_model_params_to_jarvis(const common_params & params) {
auto mparams = llama_model_default_params(); auto mparams = jarvis_model_default_params();
if (params.n_gpu_layers != -1) { if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers; mparams.n_gpu_layers = params.n_gpu_layers;
@ -1025,8 +1025,8 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
throw std::runtime_error("Unsupported cache type: " + s); throw std::runtime_error("Unsupported cache type: " + s);
} }
struct llama_context_params common_context_params_to_llama(const common_params & params) { struct jarvis_context_params common_context_params_to_jarvis(const common_params & params) {
auto cparams = llama_context_default_params(); auto cparams = jarvis_context_default_params();
cparams.n_ctx = params.n_ctx; cparams.n_ctx = params.n_ctx;
cparams.n_seq_max = params.n_parallel; cparams.n_seq_max = params.n_parallel;
@ -1056,7 +1056,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
if (params.reranking) { if (params.reranking) {
cparams.embeddings = true; cparams.embeddings = true;
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK; cparams.pooling_type = JARVIS_POOLING_TYPE_RANK;
} }
cparams.type_k = kv_cache_type_from_str(params.cache_type_k); cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
@ -1081,7 +1081,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
return tpp; return tpp;
} }
#ifdef LLAMA_USE_CURL #ifdef JARVIS_USE_CURL
#define CURL_MAX_RETRY 3 #define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2 #define CURL_RETRY_DELAY_SECONDS 2
@ -1279,7 +1279,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L); curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
// helper function to hide password in URL // helper function to hide password in URL
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { auto jarvis_download_hide_password_in_url = [](const std::string & url) -> std::string {
std::size_t protocol_pos = url.find("://"); std::size_t protocol_pos = url.find("://");
if (protocol_pos == std::string::npos) { if (protocol_pos == std::string::npos) {
return url; // Malformed URL return url; // Malformed URL
@ -1295,7 +1295,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
// start the download // start the download
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); jarvis_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) { if (!was_perform_successful) {
return false; return false;
@ -1329,11 +1329,11 @@ static bool common_download_file(const std::string & url, const std::string & pa
return true; return true;
} }
struct llama_model * common_load_model_from_url( struct jarvis_model * common_load_model_from_url(
const char * model_url, const char * model_url,
const char * path_model, const char * path_model,
const char * hf_token, const char * hf_token,
const struct llama_model_params & params) { const struct jarvis_model_params & params) {
// Basic validation of the model_url // Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) { if (!model_url || strlen(model_url) == 0) {
LOG_ERR("%s: invalid model_url\n", __func__); LOG_ERR("%s: invalid model_url\n", __func__);
@ -1367,17 +1367,17 @@ struct llama_model * common_load_model_from_url(
if (n_split > 1) { if (n_split > 1) {
char split_prefix[PATH_MAX] = {0}; char split_prefix[PATH_MAX] = {0};
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; char split_url_prefix[JARVIS_CURL_MAX_URL_LENGTH] = {0};
// Verify the first split file format // Verify the first split file format
// and extract split URL and PATH prefixes // and extract split URL and PATH prefixes
{ {
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { if (!jarvis_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split); LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
return NULL; return NULL;
} }
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { if (!jarvis_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split); LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
return NULL; return NULL;
} }
@ -1388,10 +1388,10 @@ struct llama_model * common_load_model_from_url(
for (int idx = 1; idx < n_split; idx++) { for (int idx = 1; idx < n_split; idx++) {
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool { futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
char split_path[PATH_MAX] = {0}; char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); jarvis_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; char split_url[JARVIS_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); jarvis_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
return common_download_file(split_url, split_path, hf_token); return common_download_file(split_url, split_path, hf_token);
}, idx)); }, idx));
@ -1405,19 +1405,19 @@ struct llama_model * common_load_model_from_url(
} }
} }
return llama_load_model_from_file(path_model, params); return jarvis_load_model_from_file(path_model, params);
} }
struct llama_model * common_load_model_from_hf( struct jarvis_model * common_load_model_from_hf(
const char * repo, const char * repo,
const char * model, const char * model,
const char * path_model, const char * path_model,
const char * hf_token, const char * hf_token,
const struct llama_model_params & params) { const struct jarvis_model_params & params) {
// construct hugging face model url: // construct hugging face model url:
// //
// --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf // --repo ggml-org/models --file tinyjarvis-1.1b/ggml-model-f16.gguf
// https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf // https://huggingface.co/ggml-org/models/resolve/main/tinyjarvis-1.1b/ggml-model-f16.gguf
// //
// --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
// https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
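A hedged sketch of the Hugging Face download path documented in the comment above; repo, file and local path are placeholders, and an empty hf_token is assumed to mean anonymous access:

    jarvis_model_params mparams = jarvis_model_default_params();
    jarvis_model * model = common_load_model_from_hf(
        "ggml-org/models",                      // --repo (placeholder, mirrors the comment above)
        "tinyjarvis-1.1b/ggml-model-f16.gguf",  // --file (placeholder, mirrors the comment above)
        "/tmp/ggml-model-f16.gguf",             // local path for the downloaded file
        "",                                     // hf_token
        mparams);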
@ -1433,42 +1433,42 @@ struct llama_model * common_load_model_from_hf(
#else #else
struct llama_model * common_load_model_from_url( struct jarvis_model * common_load_model_from_url(
const char * /*model_url*/, const char * /*model_url*/,
const char * /*path_model*/, const char * /*path_model*/,
const char * /*hf_token*/, const char * /*hf_token*/,
const struct llama_model_params & /*params*/) { const struct jarvis_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); LOG_WRN("%s: jarvis.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr; return nullptr;
} }
struct llama_model * common_load_model_from_hf( struct jarvis_model * common_load_model_from_hf(
const char * /*repo*/, const char * /*repo*/,
const char * /*model*/, const char * /*model*/,
const char * /*path_model*/, const char * /*path_model*/,
const char * /*hf_token*/, const char * /*hf_token*/,
const struct llama_model_params & /*params*/) { const struct jarvis_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); LOG_WRN("%s: jarvis.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr; return nullptr;
} }
#endif // LLAMA_USE_CURL #endif // JARVIS_USE_CURL
// //
// Batch utils // Batch utils
// //
void common_batch_clear(struct llama_batch & batch) { void common_batch_clear(struct jarvis_batch & batch) {
batch.n_tokens = 0; batch.n_tokens = 0;
} }
void common_batch_add( void common_batch_add(
struct llama_batch & batch, struct jarvis_batch & batch,
llama_token id, jarvis_token id,
llama_pos pos, jarvis_pos pos,
const std::vector<llama_seq_id> & seq_ids, const std::vector<jarvis_seq_id> & seq_ids,
bool logits) { bool logits) {
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded"); GGML_ASSERT(batch.seq_id[batch.n_tokens] && "jarvis_batch size exceeded");
batch.token [batch.n_tokens] = id; batch.token [batch.n_tokens] = id;
batch.pos [batch.n_tokens] = pos; batch.pos [batch.n_tokens] = pos;
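A minimal sketch of the batch helpers above; the helper name is hypothetical and the jarvis_batch is assumed to be allocated elsewhere with enough capacity:

    // Fill an already-allocated batch with a single-sequence prompt,
    // requesting logits only for the last token.
    static void fill_prompt_batch(jarvis_batch & batch, const std::vector<jarvis_token> & prompt) {
        common_batch_clear(batch);
        for (size_t i = 0; i < prompt.size(); ++i) {
            common_batch_add(batch, prompt[i], (jarvis_pos) i, { 0 }, i + 1 == prompt.size());
        }
    }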
@ -1485,26 +1485,26 @@ void common_batch_add(
// Vocab utils // Vocab utils
// //
std::vector<llama_token> common_tokenize( std::vector<jarvis_token> common_tokenize(
const struct llama_context * ctx, const struct jarvis_context * ctx,
const std::string & text, const std::string & text,
bool add_special, bool add_special,
bool parse_special) { bool parse_special) {
return common_tokenize(llama_get_model(ctx), text, add_special, parse_special); return common_tokenize(jarvis_get_model(ctx), text, add_special, parse_special);
} }
std::vector<llama_token> common_tokenize( std::vector<jarvis_token> common_tokenize(
const struct llama_model * model, const struct jarvis_model * model,
const std::string & text, const std::string & text,
bool add_special, bool add_special,
bool parse_special) { bool parse_special) {
// upper limit for the number of tokens // upper limit for the number of tokens
int n_tokens = text.length() + 2 * add_special; int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens); std::vector<jarvis_token> result(n_tokens);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); n_tokens = jarvis_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens < 0) { if (n_tokens < 0) {
result.resize(-n_tokens); result.resize(-n_tokens);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); int check = jarvis_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
GGML_ASSERT(check == -n_tokens); GGML_ASSERT(check == -n_tokens);
} else { } else {
result.resize(n_tokens); result.resize(n_tokens);
@ -1512,13 +1512,13 @@ std::vector<llama_token> common_tokenize(
return result; return result;
} }
std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { std::string common_token_to_piece(const struct jarvis_context * ctx, jarvis_token token, bool special) {
std::string piece; std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special); const int n_chars = jarvis_token_to_piece(jarvis_get_model(ctx), token, &piece[0], piece.size(), 0, special);
if (n_chars < 0) { if (n_chars < 0) {
piece.resize(-n_chars); piece.resize(-n_chars);
int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special); int check = jarvis_token_to_piece(jarvis_get_model(ctx), token, &piece[0], piece.size(), 0, special);
GGML_ASSERT(check == -n_chars); GGML_ASSERT(check == -n_chars);
} }
else { else {
@ -1528,13 +1528,13 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
return piece; return piece;
} }
std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) { std::string common_detokenize(jarvis_context * ctx, const std::vector<jarvis_token> & tokens, bool special) {
std::string text; std::string text;
text.resize(std::max(text.capacity(), tokens.size())); text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); int32_t n_chars = jarvis_detokenize(jarvis_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
if (n_chars < 0) { if (n_chars < 0) {
text.resize(-n_chars); text.resize(-n_chars);
n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); n_chars = jarvis_detokenize(jarvis_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
} }
@ -1549,18 +1549,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
// //
bool common_chat_verify_template(const std::string & tmpl) { bool common_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}}; jarvis_chat_message chat[] = {{"user", "test"}};
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); int res = jarvis_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0; return res >= 0;
} }
std::string common_chat_apply_template(const struct llama_model * model, std::string common_chat_apply_template(const struct jarvis_model * model,
const std::string & tmpl, const std::string & tmpl,
const std::vector<common_chat_msg> & msgs, const std::vector<common_chat_msg> & msgs,
bool add_ass) { bool add_ass) {
int alloc_size = 0; int alloc_size = 0;
bool fallback = false; // indicate if we must fallback to default chatml bool fallback = false; // indicate if we must fallback to default chatml
std::vector<llama_chat_message> chat; std::vector<jarvis_chat_message> chat;
for (auto & msg : msgs) { for (auto & msg : msgs) {
chat.push_back({msg.role.c_str(), msg.content.c_str()}); chat.push_back({msg.role.c_str(), msg.content.c_str()});
alloc_size += (msg.role.size() + msg.content.size()) * 1.25; alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
@ -1570,17 +1570,17 @@ std::string common_chat_apply_template(const struct llama_model * model,
std::vector<char> buf(alloc_size); std::vector<char> buf(alloc_size);
// run the first time to get the total output length // run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); int32_t res = jarvis_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
// error: chat template is not supported // error: chat template is not supported
if (res < 0) { if (res < 0) {
if (ptr_tmpl != nullptr) { if (ptr_tmpl != nullptr) {
// if the custom "tmpl" is not supported, we throw an error // if the custom "tmpl" is not supported, we throw an error
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() // this is a bit redundant (for good), since we're not sure if user validated the custom template with jarvis_chat_verify_template()
throw std::runtime_error("this custom template is not supported"); throw std::runtime_error("this custom template is not supported");
} else { } else {
// If the built-in template is not supported, we default to chatml // If the built-in template is not supported, we default to chatml
res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); res = jarvis_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
fallback = true; fallback = true;
} }
} }
@ -1588,7 +1588,7 @@ std::string common_chat_apply_template(const struct llama_model * model,
// if it turns out that our buffer is too small, we resize it // if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) { if ((size_t) res > buf.size()) {
buf.resize(res); buf.resize(res);
res = llama_chat_apply_template( res = jarvis_chat_apply_template(
fallback ? nullptr : model, fallback ? nullptr : model,
fallback ? "chatml" : ptr_tmpl, fallback ? "chatml" : ptr_tmpl,
chat.data(), chat.size(), add_ass, buf.data(), buf.size()); chat.data(), chat.size(), add_ass, buf.data(), buf.size());
@ -1598,7 +1598,7 @@ std::string common_chat_apply_template(const struct llama_model * model,
return formatted_chat; return formatted_chat;
} }
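A sketch of driving the chat-template wrapper above; the messages are illustrative, and an empty tmpl is assumed to select the model's built-in template, as in upstream llama.cpp:

    std::vector<common_chat_msg> msgs = {
        {"system",    "You are a helpful assistant"},
        {"user",      "Hello"},
        {"assistant", "Hi there"},
    };
    // An unsupported built-in template falls back to chatml; an unsupported custom tmpl throws.
    std::string prompt = common_chat_apply_template(model, "", msgs, /*add_ass=*/true);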
std::string common_chat_format_single(const struct llama_model * model, std::string common_chat_format_single(const struct jarvis_model * model,
const std::string & tmpl, const std::string & tmpl,
const std::vector<common_chat_msg> & past_msg, const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg, const common_chat_msg & new_msg,
@ -1618,7 +1618,7 @@ std::string common_chat_format_single(const struct llama_model * model,
return ss.str(); return ss.str();
} }
std::string common_chat_format_example(const struct llama_model * model, std::string common_chat_format_example(const struct jarvis_model * model,
const std::string & tmpl) { const std::string & tmpl) {
std::vector<common_chat_msg> msgs = { std::vector<common_chat_msg> msgs = {
{"system", "You are a helpful assistant"}, {"system", "You are a helpful assistant"},
@ -1633,14 +1633,14 @@ std::string common_chat_format_example(const struct llama_model * model,
// KV cache utils // KV cache utils
// //
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { void common_kv_cache_dump_view(const jarvis_kv_cache_view & view, int row_size) {
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
llama_kv_cache_view_cell * c_curr = view.cells; jarvis_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences; jarvis_seq_id * cs_curr = view.cells_sequences;
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
if (i % row_size == 0) { if (i % row_size == 0) {
@ -1656,15 +1656,15 @@ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
printf("\n=== Done dumping\n"); printf("\n=== Done dumping\n");
} }
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) { void common_kv_cache_dump_view_seqs(const jarvis_kv_cache_view & view, int row_size) {
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
std::unordered_map<llama_seq_id, size_t> seqs; std::unordered_map<jarvis_seq_id, size_t> seqs;
llama_kv_cache_view_cell * c_curr = view.cells; jarvis_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences; jarvis_seq_id * cs_curr = view.cells_sequences;
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
for (int j = 0; j < view.n_seq_max; j++) { for (int j = 0; j < view.n_seq_max; j++) {
@ -1949,12 +1949,12 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
} }
} }
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx, void yaml_dump_non_result_info(FILE * stream, const common_params & params, const jarvis_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) { const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
const auto & sparams = params.sparams; const auto & sparams = params.sparams;
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); fprintf(stream, "build_commit: %s\n", JARVIS_COMMIT);
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); fprintf(stream, "build_number: %d\n", JARVIS_BUILD_NUMBER);
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false"); fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
@ -1985,7 +1985,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
#endif // NDEBUG #endif // NDEBUG
fprintf(stream, "model_desc: %s\n", model_desc); fprintf(stream, "model_desc: %s\n", model_desc);
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx))); fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", jarvis_n_vocab(jarvis_get_model(lctx)));
#ifdef __OPTIMIZE__ #ifdef __OPTIMIZE__
fprintf(stream, "optimize: true\n"); fprintf(stream, "optimize: true\n");
@ -2087,7 +2087,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false"); fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + jarvis_max_devices());
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector); yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);


@ -2,7 +2,7 @@
#pragma once #pragma once
#include "llama.h" #include "jarvis.h"
#include <string> #include <string>
#include <vector> #include <vector>
@ -18,8 +18,8 @@
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define print_build_info() do { \ #define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ fprintf(stderr, "%s: build = %d (%s)\n", __func__, JARVIS_BUILD_NUMBER, JARVIS_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ fprintf(stderr, "%s: built with %s for %s\n", __func__, JARVIS_COMPILER, JARVIS_BUILD_TARGET); \
} while(0) } while(0)
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
@ -30,14 +30,14 @@ struct common_lora_adapter_info {
}; };
struct common_lora_adapter_container : common_lora_adapter_info { struct common_lora_adapter_container : common_lora_adapter_info {
struct llama_lora_adapter * adapter; struct jarvis_lora_adapter * adapter;
}; };
// build info // build info
extern int LLAMA_BUILD_NUMBER; extern int JARVIS_BUILD_NUMBER;
extern char const * LLAMA_COMMIT; extern char const * JARVIS_COMMIT;
extern char const * LLAMA_COMPILER; extern char const * JARVIS_COMPILER;
extern char const * LLAMA_BUILD_TARGET; extern char const * JARVIS_BUILD_TARGET;
struct common_control_vector_load_info; struct common_control_vector_load_info;
@ -61,25 +61,25 @@ int32_t cpu_get_num_math();
// Common params // Common params
// //
enum llama_example { enum jarvis_example {
LLAMA_EXAMPLE_COMMON, JARVIS_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE, JARVIS_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN, JARVIS_EXAMPLE_MAIN,
LLAMA_EXAMPLE_INFILL, JARVIS_EXAMPLE_INFILL,
LLAMA_EXAMPLE_EMBEDDING, JARVIS_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY, JARVIS_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL, JARVIS_EXAMPLE_RETRIEVAL,
LLAMA_EXAMPLE_PASSKEY, JARVIS_EXAMPLE_PASSKEY,
LLAMA_EXAMPLE_IMATRIX, JARVIS_EXAMPLE_IMATRIX,
LLAMA_EXAMPLE_BENCH, JARVIS_EXAMPLE_BENCH,
LLAMA_EXAMPLE_SERVER, JARVIS_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CVECTOR_GENERATOR, JARVIS_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA, JARVIS_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_LLAVA, JARVIS_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_LOOKUP, JARVIS_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL, JARVIS_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_COUNT, JARVIS_EXAMPLE_COUNT,
}; };
enum common_sampler_type { enum common_sampler_type {
@ -103,7 +103,7 @@ enum dimre_method {
// sampler parameters // sampler parameters
struct common_sampler_params { struct common_sampler_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler uint32_t seed = JARVIS_DEFAULT_SEED; // the seed used to initialize jarvis_sampler
int32_t n_prev = 64; // number of previous tokens to remember int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
@ -149,7 +149,7 @@ struct common_sampler_params {
std::string grammar; // optional BNF-like grammar to constrain sampling std::string grammar; // optional BNF-like grammar to constrain sampling
std::vector<llama_logit_bias> logit_bias; // logit biases to apply std::vector<jarvis_logit_bias> logit_bias; // logit biases to apply
// print the parameters into a string // print the parameters into a string
std::string print() const; std::string print() const;
@ -192,10 +192,10 @@ struct common_params {
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs enum jarvis_split_mode split_mode = JARVIS_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum jarvis_rope_scaling_type rope_scaling_type = JARVIS_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum jarvis_pooling_type pooling_type = JARVIS_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings enum jarvis_attention_type attention_type = JARVIS_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
struct common_sampler_params sparams; struct common_sampler_params sparams;
@ -219,9 +219,9 @@ struct common_params {
std::vector<std::string> in_files; // all input files std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides; std::vector<jarvis_model_kv_override> kv_overrides;
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply) bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using jarvis_lora_adapter_apply)
std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
@ -377,15 +377,15 @@ bool set_process_priority(enum ggml_sched_priority prio);
#ifdef __GNUC__ #ifdef __GNUC__
#ifdef __MINGW32__ #ifdef __MINGW32__
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) #define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else #else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) #define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif #endif
#else #else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) #define JARVIS_COMMON_ATTRIBUTE_FORMAT(...)
#endif #endif
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) JARVIS_COMMON_ATTRIBUTE_FORMAT(1, 2)
std::string string_format(const char * fmt, ...); std::string string_format(const char * fmt, ...);
std::string string_strip(const std::string & str); std::string string_strip(const std::string & str);
@ -424,13 +424,13 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
return parts; return parts;
} }
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides); bool string_parse_kv_override(const char * data, std::vector<jarvis_model_kv_override> & overrides);
void string_process_escapes(std::string & input); void string_process_escapes(std::string & input);
std::string string_from(bool value); std::string string_from(bool value);
std::string string_from(const std::vector<int> & values); std::string string_from(const std::vector<int> & values);
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens); std::string string_from(const struct jarvis_context * ctx, const std::vector<jarvis_token> & tokens);
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch); std::string string_from(const struct jarvis_context * ctx, const struct jarvis_batch & batch);
// //
// Filesystem utils // Filesystem utils
@ -447,32 +447,32 @@ std::string fs_get_cache_file(const std::string & filename);
// //
struct common_init_result { struct common_init_result {
struct llama_model * model = nullptr; struct jarvis_model * model = nullptr;
struct llama_context * context = nullptr; struct jarvis_context * context = nullptr;
std::vector<common_lora_adapter_container> lora_adapters; std::vector<common_lora_adapter_container> lora_adapters;
}; };
struct common_init_result common_init_from_params(common_params & params); struct common_init_result common_init_from_params(common_params & params);
struct llama_model_params common_model_params_to_llama (const common_params & params); struct jarvis_model_params common_model_params_to_jarvis (const common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params); struct jarvis_context_params common_context_params_to_jarvis(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct jarvis_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct jarvis_model_params & params);
struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct jarvis_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct jarvis_model_params & params);
// clear LoRA adapters from context, then apply new list of adapters // clear LoRA adapters from context, then apply new list of adapters
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters); void common_lora_adapters_apply(struct jarvis_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
// Batch utils // Batch utils
void common_batch_clear(struct llama_batch & batch); void common_batch_clear(struct jarvis_batch & batch);
void common_batch_add( void common_batch_add(
struct llama_batch & batch, struct jarvis_batch & batch,
llama_token id, jarvis_token id,
llama_pos pos, jarvis_pos pos,
const std::vector<llama_seq_id> & seq_ids, const std::vector<jarvis_seq_id> & seq_ids,
bool logits); bool logits);
// //
@ -481,14 +481,14 @@ void common_batch_add(
// tokenizes a string into a vector of tokens // tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode` // should work similar to Python's `tokenizer.encode`
std::vector<llama_token> common_tokenize( std::vector<jarvis_token> common_tokenize(
const struct llama_context * ctx, const struct jarvis_context * ctx,
const std::string & text, const std::string & text,
bool add_special, bool add_special,
bool parse_special = false); bool parse_special = false);
std::vector<llama_token> common_tokenize( std::vector<jarvis_token> common_tokenize(
const struct llama_model * model, const struct jarvis_model * model,
const std::string & text, const std::string & text,
bool add_special, bool add_special,
bool parse_special = false); bool parse_special = false);
@ -496,23 +496,23 @@ std::vector<llama_token> common_tokenize(
// tokenizes a token into a piece, optionally renders special/control tokens // tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece` // should work similar to Python's `tokenizer.id_to_piece`
std::string common_token_to_piece( std::string common_token_to_piece(
const struct llama_context * ctx, const struct jarvis_context * ctx,
llama_token token, jarvis_token token,
bool special = true); bool special = true);
// detokenizes a vector of tokens into a string // detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode` // should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens // optionally renders special/control tokens
std::string common_detokenize( std::string common_detokenize(
llama_context * ctx, jarvis_context * ctx,
const std::vector<llama_token> & tokens, const std::vector<jarvis_token> & tokens,
bool special = true); bool special = true);
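The encode/decode pair above mirrors Python's tokenizer.encode/decode; a minimal round-trip sketch under the same include-path assumption:

#include "common.h"   // assumed include path
#include <string>
#include <vector>

// Encode a string and decode it back, rendering special/control tokens.
static std::string roundtrip(jarvis_context * ctx, const std::string & text) {
    std::vector<jarvis_token> toks = common_tokenize(ctx, text, /*add_special=*/true);
    return common_detokenize(ctx, toks, /*special=*/true);
}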
// //
// Chat template utils // Chat template utils
// //
// same with llama_chat_message, but uses std::string // same with jarvis_chat_message, but uses std::string
struct common_chat_msg { struct common_chat_msg {
std::string role; std::string role;
std::string content; std::string content;
@ -521,23 +521,23 @@ struct common_chat_msg {
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool common_chat_verify_template(const std::string & tmpl); bool common_chat_verify_template(const std::string & tmpl);
// CPP wrapper for llama_chat_apply_template // CPP wrapper for jarvis_chat_apply_template
// If the built-in template is not supported, we default to chatml // If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error // If the custom "tmpl" is not supported, we throw an error
std::string common_chat_apply_template(const struct llama_model * model, std::string common_chat_apply_template(const struct jarvis_model * model,
const std::string & tmpl, const std::string & tmpl,
const std::vector<common_chat_msg> & chat, const std::vector<common_chat_msg> & chat,
bool add_ass); bool add_ass);
// Format single message, while taking into account the position of that message in chat history // Format single message, while taking into account the position of that message in chat history
std::string common_chat_format_single(const struct llama_model * model, std::string common_chat_format_single(const struct jarvis_model * model,
const std::string & tmpl, const std::string & tmpl,
const std::vector<common_chat_msg> & past_msg, const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg, const common_chat_msg & new_msg,
bool add_ass); bool add_ass);
// Returns an example of formatted chat // Returns an example of formatted chat
std::string common_chat_format_example(const struct llama_model * model, std::string common_chat_format_example(const struct jarvis_model * model,
const std::string & tmpl); const std::string & tmpl);
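A minimal sketch of the chat helpers, assuming "common.h" as the include path and that an empty template string selects the model's built-in template (with the chatml fallback noted above):

#include "common.h"   // assumed include path
#include <string>
#include <vector>

static std::string format_chat(const jarvis_model * model) {
    std::vector<common_chat_msg> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    // add_ass = true appends the assistant prefix so generation can start
    return common_chat_apply_template(model, /*tmpl=*/"", chat, /*add_ass=*/true);
}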
// //
@ -545,10 +545,10 @@ std::string common_chat_format_example(const struct llama_model * model,
// //
// Dump the KV cache view with the number of sequences per cell. // Dump the KV cache view with the number of sequences per cell.
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80); void common_kv_cache_dump_view(const jarvis_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output). // Dump the KV cache view showing individual sequences in each cell (long output).
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40); void common_kv_cache_dump_view_seqs(const jarvis_kv_cache_view & view, int row_size = 40);
// //
// Embedding utils // Embedding utils
@ -596,5 +596,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data); void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
void yaml_dump_non_result_info( void yaml_dump_non_result_info(
FILE * stream, const common_params & params, const llama_context * lctx, FILE * stream, const common_params & params, const jarvis_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc); const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
View file
@ -435,7 +435,7 @@ namespace console {
fputc('\n', out); fputc('\n', out);
has_more = !has_more; has_more = !has_more;
} else { } else {
// llama will just eat the single space, it won't act as a space // jarvis will just eat the single space, it won't act as a space
if (line.length() == 1 && line.back() == ' ') { if (line.length() == 1 && line.back() == ' ') {
line.clear(); line.clear();
pop_cursor(); pop_cursor();
View file
@ -5336,7 +5336,7 @@ template<typename IteratorType> class iteration_proxy
}; };
// Structured Bindings Support // Structured Bindings Support
// For further reference see https://blog.tartanllama.xyz/structured-bindings/ // For further reference see https://blog.tartanjarvis.xyz/structured-bindings/
// And see https://github.com/nlohmann/json/pull/1391 // And see https://github.com/nlohmann/json/pull/1391
template<std::size_t N, typename IteratorType, enable_if_t<N == 0, int> = 0> template<std::size_t N, typename IteratorType, enable_if_t<N == 0, int> = 0>
auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.key()) auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.key())
@ -5344,7 +5344,7 @@ auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decl
return i.key(); return i.key();
} }
// Structured Bindings Support // Structured Bindings Support
// For further reference see https://blog.tartanllama.xyz/structured-bindings/ // For further reference see https://blog.tartanjarvis.xyz/structured-bindings/
// And see https://github.com/nlohmann/json/pull/1391 // And see https://github.com/nlohmann/json/pull/1391
template<std::size_t N, typename IteratorType, enable_if_t<N == 1, int> = 0> template<std::size_t N, typename IteratorType, enable_if_t<N == 1, int> = 0>
auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.value()) auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.value())
@ -5357,7 +5357,7 @@ NLOHMANN_JSON_NAMESPACE_END
// The Addition to the STD Namespace is required to add // The Addition to the STD Namespace is required to add
// Structured Bindings Support to the iteration_proxy_value class // Structured Bindings Support to the iteration_proxy_value class
// For further reference see https://blog.tartanllama.xyz/structured-bindings/ // For further reference see https://blog.tartanjarvis.xyz/structured-bindings/
// And see https://github.com/nlohmann/json/pull/1391 // And see https://github.com/nlohmann/json/pull/1391
namespace std namespace std
{ {
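These specializations exist so that nlohmann::json's items() proxy can be unpacked with structured bindings; a minimal sketch, assuming the vendored single header is included as "json.hpp":

#include <iostream>
#include "json.hpp"   // assumed path of the vendored nlohmann::json header

int main() {
    nlohmann::json j = { {"alpha", 1}, {"beta", 2} };
    // items() yields iteration_proxy_value objects; the tuple_size /
    // tuple_element / get<N> machinery above makes them decomposable.
    for (const auto & [key, value] : j.items()) {
        std::cout << key << " = " << value << '\n';
    }
    return 0;
}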
View file
@ -8,7 +8,7 @@
#include <thread> #include <thread>
#include <vector> #include <vector>
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA; int common_log_verbosity_thold = LOG_DEFAULT_JARVIS;
void common_log_set_verbosity_thold(int verbosity) { void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity; common_log_verbosity_thold = verbosity;
View file
@ -11,7 +11,7 @@
#endif #endif
#define LOG_DEFAULT_DEBUG 1 #define LOG_DEFAULT_DEBUG 1
#define LOG_DEFAULT_LLAMA 0 #define LOG_DEFAULT_JARVIS 0
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
// set via common_log_set_verbosity() // set via common_log_set_verbosity()
View file
@ -9,7 +9,7 @@
#include <thread> #include <thread>
void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) { std::vector<jarvis_token> & inp, int nnew, bool print_progress) {
const int64_t t_start_ms = ggml_time_ms(); const int64_t t_start_ms = ggml_time_ms();
const int64_t inp_size = inp.size(); const int64_t inp_size = inp.size();
@ -21,7 +21,7 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min,
for (int64_t i = i_start; i < inp_size; ++i) { for (int64_t i = i_start; i < inp_size; ++i) {
const int64_t ngram_start = i - ngram_size; const int64_t ngram_start = i - ngram_size;
common_ngram ngram(&inp[ngram_start], ngram_size); common_ngram ngram(&inp[ngram_start], ngram_size);
const llama_token token = inp[i]; const jarvis_token token = inp[i];
common_ngram_cache::iterator part_it = ngram_cache.find(ngram); common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
if (part_it == ngram_cache.end()) { if (part_it == ngram_cache.end()) {
@ -51,18 +51,18 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min,
} }
// Helper function to get a token from the combined, speculative sequence of inp and draft. // Helper function to get a token from the combined, speculative sequence of inp and draft.
static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) { static jarvis_token get_token(const std::vector<jarvis_token> & inp, const std::vector<jarvis_token> & draft, const size_t i) {
return i < inp.size() ? inp[i] : draft[1 + i - inp.size()]; return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
} }
// If sample size or percentage are below these thresholds the draft is aborted early: // If sample size or percentage are below these thresholds the draft is aborted early:
constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2, 2, 1, 1}; constexpr int draft_min_sample_size_lax[JARVIS_NGRAM_MAX] = { 2, 2, 1, 1};
constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50}; constexpr int draft_min_percent_lax[JARVIS_NGRAM_MAX] = {66, 50, 50, 50};
constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2}; constexpr int draft_min_sample_size_strict[JARVIS_NGRAM_MAX] = { 4, 3, 2, 2};
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66}; constexpr int draft_min_percent_strict[JARVIS_NGRAM_MAX] = {75, 66, 66, 66};
// Helper function that tries to draft a token from only the static ngram cache: // Helper function that tries to draft a token from only the static ngram cache:
static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) { static jarvis_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
if (part_static_it == nc_static.end()) { if (part_static_it == nc_static.end()) {
return -1; return -1;
@ -71,10 +71,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
int max_count_static = 0; int max_count_static = 0;
int sum_count_static = 0; int sum_count_static = 0;
llama_token max_token = -1; jarvis_token max_token = -1;
for (std::pair<llama_token, int> token_count_static : part_static) { for (std::pair<jarvis_token, int> token_count_static : part_static) {
const llama_token token = token_count_static.first; const jarvis_token token = token_count_static.first;
const int32_t count_static = token_count_static.second; const int32_t count_static = token_count_static.second;
if (count_static > max_count_static) { if (count_static > max_count_static) {
@ -84,21 +84,21 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
sum_count_static += count_static; sum_count_static += count_static;
} }
if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) { if (sum_count_static < draft_min_sample_size_lax[JARVIS_NGRAM_STATIC-1]) {
return -1; return -1;
} }
if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) { if (100*max_count_static < draft_min_percent_lax[JARVIS_NGRAM_STATIC-1]*sum_count_static) {
return -1; return -1;
} }
return max_token; return max_token;
} }
// Try to draft a token from primary cache (context/dynamic), validate with static cache: // Try to draft a token from primary cache (context/dynamic), validate with static cache:
static llama_token try_draft( static jarvis_token try_draft(
common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static, common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
const int * min_sample_size, const int * min_percent) { const int * min_sample_size, const int * min_percent) {
llama_token drafted_token = -1; jarvis_token drafted_token = -1;
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) { for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
const common_ngram ngram_primary = ngrams_primary[i]; const common_ngram ngram_primary = ngrams_primary[i];
@ -112,10 +112,10 @@ static llama_token try_draft(
int max_count_primary = 0; int max_count_primary = 0;
int max_count_static = 0; int max_count_static = 0;
int sum_count_primary = 0; int sum_count_primary = 0;
llama_token max_token = -1; jarvis_token max_token = -1;
for (std::pair<llama_token, int> token_count_primary : part_primary) { for (std::pair<jarvis_token, int> token_count_primary : part_primary) {
const llama_token token = token_count_primary.first; const jarvis_token token = token_count_primary.first;
common_ngram_cache_part::iterator token_count_static_it = part_static.find(token); common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
@ -143,22 +143,22 @@ static llama_token try_draft(
} }
void common_ngram_cache_draft( void common_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max, std::vector<jarvis_token> & inp, std::vector<jarvis_token> & draft, int n_draft, int ngram_min, int ngram_max,
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
) { ) {
GGML_ASSERT(draft.size() == 1); GGML_ASSERT(draft.size() == 1);
const int inp_size = inp.size(); const int inp_size = inp.size();
if (inp_size < LLAMA_NGRAM_STATIC) { if (inp_size < JARVIS_NGRAM_STATIC) {
return; return;
} }
while ((int) draft.size()-1 < n_draft) { while ((int) draft.size()-1 < n_draft) {
llama_token drafted_token = -1; jarvis_token drafted_token = -1;
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; const int ngram_start_static = inp_size-JARVIS_NGRAM_STATIC + draft.size()-1;
common_ngram ngram_static; common_ngram ngram_static;
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) { for (int j = ngram_start_static; j < ngram_start_static + JARVIS_NGRAM_STATIC; ++j) {
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j); ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
} }
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
@ -207,12 +207,12 @@ void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & fil
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram)); file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t)); file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
for (std::pair<llama_token, int32_t> item2 : token_counts) { for (std::pair<jarvis_token, int32_t> item2 : token_counts) {
const llama_token token = item2.first; const jarvis_token token = item2.first;
const int32_t count = item2.second; const int32_t count = item2.second;
GGML_ASSERT(count > 0); GGML_ASSERT(count > 0);
file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token)); file_out.write(reinterpret_cast<const char *>(&token), sizeof(jarvis_token));
file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t)); file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
} }
} }
@ -228,7 +228,7 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) {
common_ngram ngram; common_ngram ngram;
int32_t ntokens; int32_t ntokens;
llama_token token; jarvis_token token;
int32_t count; int32_t count;
char * ngramc = reinterpret_cast<char*>(&ngram); char * ngramc = reinterpret_cast<char*>(&ngram);
@ -243,7 +243,7 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) {
for (int i = 0; i < ntokens; ++i) { for (int i = 0; i < ntokens; ++i) {
GGML_ASSERT(!hashmap_file.eof()); GGML_ASSERT(!hashmap_file.eof());
GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token))); GGML_ASSERT(hashmap_file.read(tokenc, sizeof(jarvis_token)));
GGML_ASSERT(!hashmap_file.eof()); GGML_ASSERT(!hashmap_file.eof());
GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t))); GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
GGML_ASSERT(count > 0); GGML_ASSERT(count > 0);
@ -268,8 +268,8 @@ void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ng
continue; continue;
} }
for (std::pair<llama_token, int32_t> token_count : part) { for (std::pair<jarvis_token, int32_t> token_count : part) {
const llama_token token = token_count.first; const jarvis_token token = token_count.first;
const int32_t count = token_count.second; const int32_t count = token_count.second;
GGML_ASSERT(count > 0); GGML_ASSERT(count > 0);
View file
@ -1,34 +1,34 @@
#pragma once #pragma once
#include "llama.h" #include "jarvis.h"
#include <unordered_map> #include <unordered_map>
#include <string> #include <string>
#include <vector> #include <vector>
#define LLAMA_NGRAM_MIN 1 #define JARVIS_NGRAM_MIN 1
#define LLAMA_NGRAM_MAX 4 #define JARVIS_NGRAM_MAX 4
#define LLAMA_NGRAM_STATIC 2 #define JARVIS_NGRAM_STATIC 2
// Data structures to map n-grams to empirical token probabilities: // Data structures to map n-grams to empirical token probabilities:
struct common_ngram { struct common_ngram {
llama_token tokens[LLAMA_NGRAM_MAX]; jarvis_token tokens[JARVIS_NGRAM_MAX];
common_ngram() { common_ngram() {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) {
tokens[i] = -1; tokens[i] = -1;
} }
} }
common_ngram(const llama_token * input, const int ngram_size) { common_ngram(const jarvis_token * input, const int ngram_size) {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) {
tokens[i] = i < ngram_size ? input[i] : -1; tokens[i] = i < ngram_size ? input[i] : -1;
} }
} }
bool operator==(const common_ngram & other) const { bool operator==(const common_ngram & other) const {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) {
if (tokens[i] != other.tokens[i]) { if (tokens[i] != other.tokens[i]) {
return false; return false;
} }
@ -38,7 +38,7 @@ struct common_ngram {
}; };
struct common_token_hash_function { struct common_token_hash_function {
size_t operator()(const llama_token token) const { size_t operator()(const jarvis_token token) const {
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
return token * 11400714819323198485llu; return token * 11400714819323198485llu;
} }
@ -47,7 +47,7 @@ struct common_token_hash_function {
struct common_ngram_hash_function { struct common_ngram_hash_function {
size_t operator()(const common_ngram & ngram) const { size_t operator()(const common_ngram & ngram) const {
size_t hash = common_token_hash_function{}(ngram.tokens[0]); size_t hash = common_token_hash_function{}(ngram.tokens[0]);
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) { for (int i = 1; i < JARVIS_NGRAM_MAX; ++i) {
hash ^= common_token_hash_function{}(ngram.tokens[i]); hash ^= common_token_hash_function{}(ngram.tokens[i]);
} }
return hash; return hash;
@ -55,7 +55,7 @@ struct common_ngram_hash_function {
}; };
// token -> number of times token has been seen // token -> number of times token has been seen
typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part; typedef std::unordered_map<jarvis_token, int32_t> common_ngram_cache_part;
// n-gram -> empirical distribution of following tokens // n-gram -> empirical distribution of following tokens
typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache; typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
@ -71,7 +71,7 @@ typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_h
// In order to get correct results inp_data can ONLY BE APPENDED TO. // In order to get correct results inp_data can ONLY BE APPENDED TO.
// Changes in the middle need a complete rebuild. // Changes in the middle need a complete rebuild.
void common_ngram_cache_update( void common_ngram_cache_update(
common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress); common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<jarvis_token> & inp_data, int nnew, bool print_progress);
// Try to draft tokens from ngram caches. // Try to draft tokens from ngram caches.
// inp: the tokens generated so far. // inp: the tokens generated so far.
@ -82,7 +82,7 @@ void common_ngram_cache_update(
// nc_dynamic: ngram cache based on previous user generations. // nc_dynamic: ngram cache based on previous user generations.
// nc_static: ngram cache generated from a large text corpus, used for validation. // nc_static: ngram cache generated from a large text corpus, used for validation.
void common_ngram_cache_draft( void common_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max, std::vector<jarvis_token> & inp, std::vector<jarvis_token> & draft, int n_draft, int ngram_min, int ngram_max,
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static); common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
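A minimal sketch of the lookup-decoding flow these declarations describe, assuming the header is still "ngram-cache.h": fold the newly generated token into the context cache, then extend a one-token draft from the three caches.

#include "ngram-cache.h"   // assumed header name for the declarations above
#include <vector>

// `inp` holds all tokens generated so far; `last` is the most recently accepted token.
static std::vector<jarvis_token> draft_tokens(
        std::vector<jarvis_token> & inp, jarvis_token last,
        common_ngram_cache & nc_context,
        common_ngram_cache & nc_dynamic,
        common_ngram_cache & nc_static) {
    // only the newly appended token needs to be folded into the context cache
    common_ngram_cache_update(nc_context, JARVIS_NGRAM_MIN, JARVIS_NGRAM_MAX,
                              inp, /*nnew=*/1, /*print_progress=*/false);

    std::vector<jarvis_token> draft = { last };   // draft[0] must be the last accepted token
    common_ngram_cache_draft(inp, draft, /*n_draft=*/8, JARVIS_NGRAM_MIN, JARVIS_NGRAM_MAX,
                             nc_context, nc_dynamic, nc_static);
    return draft;                                 // draft[1..] holds up to 8 speculative tokens
}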
// Save an ngram cache to a file. // Save an ngram cache to a file.
View file
@ -6,7 +6,7 @@
#include <unordered_map> #include <unordered_map>
// the ring buffer works similarly to std::deque, but with a fixed capacity // the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h // TODO: deduplicate with jarvis-impl.h
template<typename T> template<typename T>
struct ring_buffer { struct ring_buffer {
ring_buffer(size_t cap) : capacity(cap), data(cap) {} ring_buffer(size_t cap) : capacity(cap), data(cap) {}
@ -101,24 +101,24 @@ struct ring_buffer {
struct common_sampler { struct common_sampler {
common_sampler_params params; common_sampler_params params;
struct llama_sampler * grmr; struct jarvis_sampler * grmr;
struct llama_sampler * chain; struct jarvis_sampler * chain;
ring_buffer<llama_token> prev; ring_buffer<jarvis_token> prev;
std::vector<llama_token_data> cur; std::vector<jarvis_token_data> cur;
llama_token_data_array cur_p; jarvis_token_data_array cur_p;
void set_logits(struct llama_context * ctx, int idx) { void set_logits(struct jarvis_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx); const auto * logits = jarvis_get_logits_ith(ctx, idx);
const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_vocab = jarvis_n_vocab(jarvis_get_model(ctx));
cur.resize(n_vocab); cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) { for (jarvis_token token_id = 0; token_id < n_vocab; token_id++) {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; cur[token_id] = jarvis_token_data{token_id, logits[token_id], 0.0f};
} }
cur_p = { cur.data(), cur.size(), -1, false }; cur_p = { cur.data(), cur.size(), -1, false };
@ -141,31 +141,31 @@ std::string common_sampler_params::print() const {
return std::string(result); return std::string(result);
} }
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) { struct common_sampler * common_sampler_init(const struct jarvis_model * model, const struct common_sampler_params & params) {
llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); jarvis_sampler_chain_params lparams = jarvis_sampler_chain_default_params();
lparams.no_perf = params.no_perf; lparams.no_perf = params.no_perf;
auto * result = new common_sampler { auto * result = new common_sampler {
/* .params = */ params, /* .params = */ params,
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"), /* .grmr = */ jarvis_sampler_init_grammar(model, params.grammar.c_str(), "root"),
/* .chain = */ llama_sampler_chain_init(lparams), /* .chain = */ jarvis_sampler_chain_init(lparams),
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)), /* .prev = */ ring_buffer<jarvis_token>(std::max(32, params.n_prev)),
/* .cur = */ {}, /* .cur = */ {},
/* .cur_p = */ {}, /* .cur_p = */ {},
}; };
llama_sampler_chain_add(result->chain, jarvis_sampler_chain_add(result->chain,
llama_sampler_init_logit_bias( jarvis_sampler_init_logit_bias(
llama_n_vocab(model), jarvis_n_vocab(model),
params.logit_bias.size(), params.logit_bias.size(),
params.logit_bias.data())); params.logit_bias.data()));
llama_sampler_chain_add(result->chain, jarvis_sampler_chain_add(result->chain,
llama_sampler_init_penalties( jarvis_sampler_init_penalties(
llama_n_vocab (model), jarvis_n_vocab (model),
llama_token_eos(model), jarvis_token_eos(model),
llama_token_nl (model), jarvis_token_nl (model),
params.penalty_last_n, params.penalty_last_n,
params.penalty_repeat, params.penalty_repeat,
params.penalty_freq, params.penalty_freq,
@ -184,44 +184,44 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
c_breakers.push_back(str.c_str()); c_breakers.push_back(str.c_str());
} }
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
} }
break; break;
case COMMON_SAMPLER_TYPE_TOP_K: case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_top_k (params.top_k));
break; break;
case COMMON_SAMPLER_TYPE_TOP_P: case COMMON_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_top_p (params.top_p, params.min_keep));
break; break;
case COMMON_SAMPLER_TYPE_MIN_P: case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_min_p (params.min_p, params.min_keep));
break; break;
case COMMON_SAMPLER_TYPE_XTC: case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break; break;
case COMMON_SAMPLER_TYPE_TFS_Z: case COMMON_SAMPLER_TYPE_TFS_Z:
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_tail_free(params.tfs_z, params.min_keep));
break; break;
case COMMON_SAMPLER_TYPE_TYPICAL_P: case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_typical (params.typ_p, params.min_keep));
break; break;
case COMMON_SAMPLER_TYPE_TEMPERATURE: case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break; break;
case COMMON_SAMPLER_TYPE_INFILL: case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_infill (model));
break; break;
default: default:
GGML_ASSERT(false && "unknown sampler type"); GGML_ASSERT(false && "unknown sampler type");
} }
} }
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) { } else if (params.mirostat == 1) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_mirostat(jarvis_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) { } else if (params.mirostat == 2) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
} else { } else {
GGML_ASSERT(false && "unknown mirostat version"); GGML_ASSERT(false && "unknown mirostat version");
} }
@ -231,53 +231,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
void common_sampler_free(struct common_sampler * gsmpl) { void common_sampler_free(struct common_sampler * gsmpl) {
if (gsmpl) { if (gsmpl) {
llama_sampler_free(gsmpl->grmr); jarvis_sampler_free(gsmpl->grmr);
llama_sampler_free(gsmpl->chain); jarvis_sampler_free(gsmpl->chain);
delete gsmpl; delete gsmpl;
} }
} }
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { void common_sampler_accept(struct common_sampler * gsmpl, jarvis_token token, bool accept_grammar) {
if (accept_grammar) { if (accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token); jarvis_sampler_accept(gsmpl->grmr, token);
} }
llama_sampler_accept(gsmpl->chain, token); jarvis_sampler_accept(gsmpl->chain, token);
gsmpl->prev.push_back(token); gsmpl->prev.push_back(token);
} }
void common_sampler_reset(struct common_sampler * gsmpl) { void common_sampler_reset(struct common_sampler * gsmpl) {
llama_sampler_reset(gsmpl->grmr); jarvis_sampler_reset(gsmpl->grmr);
llama_sampler_reset(gsmpl->chain); jarvis_sampler_reset(gsmpl->chain);
} }
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
return new common_sampler { return new common_sampler {
/* .params = */ gsmpl->params, /* .params = */ gsmpl->params,
/* .grmr = */ llama_sampler_clone(gsmpl->grmr), /* .grmr = */ jarvis_sampler_clone(gsmpl->grmr),
/* .chain = */ llama_sampler_clone(gsmpl->chain), /* .chain = */ jarvis_sampler_clone(gsmpl->chain),
/* .prev = */ gsmpl->prev, /* .prev = */ gsmpl->prev,
/* .cur = */ gsmpl->cur, /* .cur = */ gsmpl->cur,
/* .cur_p = */ gsmpl->cur_p, /* .cur_p = */ gsmpl->cur_p,
}; };
} }
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) { void common_perf_print(const struct jarvis_context * ctx, const struct common_sampler * gsmpl) {
// TODO: measure grammar performance // TODO: measure grammar performance
if (gsmpl) { if (gsmpl) {
llama_perf_sampler_print(gsmpl->chain); jarvis_perf_sampler_print(gsmpl->chain);
} }
if (ctx) { if (ctx) {
llama_perf_context_print(ctx); jarvis_perf_context_print(ctx);
} }
} }
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { jarvis_token common_sampler_sample(struct common_sampler * gsmpl, struct jarvis_context * ctx, int idx, bool grammar_first) {
gsmpl->set_logits(ctx, idx); gsmpl->set_logits(ctx, idx);
auto & grmr = gsmpl->grmr; auto & grmr = gsmpl->grmr;
@ -285,14 +285,14 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
auto & cur_p = gsmpl->cur_p; // initialized by set_logits auto & cur_p = gsmpl->cur_p; // initialized by set_logits
if (grammar_first) { if (grammar_first) {
llama_sampler_apply(grmr, &cur_p); jarvis_sampler_apply(grmr, &cur_p);
} }
llama_sampler_apply(chain, &cur_p); jarvis_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration"); GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
const llama_token id = cur_p.data[cur_p.selected].id; const jarvis_token id = cur_p.data[cur_p.selected].id;
if (grammar_first) { if (grammar_first) {
return id; return id;
@ -300,10 +300,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
// check if it the sampled token fits the grammar // check if it the sampled token fits the grammar
{ {
llama_token_data single_token_data = { id, 1.0f, 0.0f }; jarvis_token_data single_token_data = { id, 1.0f, 0.0f };
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false }; jarvis_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
llama_sampler_apply(grmr, &single_token_data_array); jarvis_sampler_apply(grmr, &single_token_data_array);
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY; const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
if (is_valid) { if (is_valid) {
@ -315,8 +315,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
gsmpl->set_logits(ctx, idx); gsmpl->set_logits(ctx, idx);
llama_sampler_apply(grmr, &cur_p); jarvis_sampler_apply(grmr, &cur_p);
llama_sampler_apply(chain, &cur_p); jarvis_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration"); GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
@ -324,31 +324,31 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
} }
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
return llama_sampler_get_seed(gsmpl->chain); return jarvis_sampler_get_seed(gsmpl->chain);
} }
// helpers // helpers
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) { jarvis_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
return &gsmpl->cur_p; return &gsmpl->cur_p;
} }
llama_token common_sampler_last(const struct common_sampler * gsmpl) { jarvis_token common_sampler_last(const struct common_sampler * gsmpl) {
return gsmpl->prev.rat(0); return gsmpl->prev.rat(0);
} }
std::string common_sampler_print(const struct common_sampler * gsmpl) { std::string common_sampler_print(const struct common_sampler * gsmpl) {
std::string result = "logits "; std::string result = "logits ";
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { for (int i = 0; i < jarvis_sampler_chain_n(gsmpl->chain); i++) {
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); const auto * smpl = jarvis_sampler_chain_get(gsmpl->chain, i);
result += std::string("-> ") + llama_sampler_name(smpl) + " "; result += std::string("-> ") + jarvis_sampler_name(smpl) + " ";
} }
return result; return result;
} }
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) { std::string common_sampler_prev_str(common_sampler * gsmpl, jarvis_context * ctx_main, int n) {
n = std::min(n, (int) gsmpl->prev.size()); n = std::min(n, (int) gsmpl->prev.size());
if (n <= 0) { if (n <= 0) {
@ -359,9 +359,9 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_
result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
for (int i = n - 1; i >= 0; i--) { for (int i = n - 1; i >= 0; i--) {
const llama_token id = gsmpl->prev.rat(i); const jarvis_token id = gsmpl->prev.rat(i);
GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen"); GGML_ASSERT(id != JARVIS_TOKEN_NULL && "null token in the sampling history - should not happen");
result += common_token_to_piece(ctx_main, id); result += common_token_to_piece(ctx_main, id);
} }
View file
@ -1,13 +1,13 @@
#pragma once #pragma once
#include "llama.h" #include "jarvis.h"
#include "common.h" #include "common.h"
#include <string> #include <string>
#include <vector> #include <vector>
// common_sampler extends llama_sampler with additional functionality: // common_sampler extends jarvis_sampler with additional functionality:
// //
// - grammar support // - grammar support
// - custom sampler logic based on the parameters // - custom sampler logic based on the parameters
@ -24,7 +24,7 @@
// grammar constraints are applied to the full vocabulary and the token is resampled. // grammar constraints are applied to the full vocabulary and the token is resampled.
// //
// The common_sampler also maintains a container with the last accepted tokens. In the future, this can // The common_sampler also maintains a container with the last accepted tokens. In the future, this can
// be moved into the core llama library. // be moved into the core jarvis library.
// //
// For convenience, the common_sampler also maintains a container with the current candidate tokens. // For convenience, the common_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens. // This can be used to access the probabilities of the rest of the non-sampled tokens.
@ -34,19 +34,19 @@
struct common_sampler; struct common_sampler;
// llama_sampler API overloads // jarvis_sampler API overloads
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params); struct common_sampler * common_sampler_init(const struct jarvis_model * model, const struct common_sampler_params & params);
void common_sampler_free(struct common_sampler * gsmpl); void common_sampler_free(struct common_sampler * gsmpl);
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar); void common_sampler_accept(struct common_sampler * gsmpl, jarvis_token token, bool accept_grammar);
void common_sampler_reset (struct common_sampler * gsmpl); void common_sampler_reset (struct common_sampler * gsmpl);
struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl); struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
// arguments can be nullptr to skip printing // arguments can be nullptr to skip printing
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl); void common_perf_print(const struct jarvis_context * ctx, const struct common_sampler * gsmpl);
// extended sampling implementation: // extended sampling implementation:
// //
@ -58,23 +58,23 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
// if grammar_first is true, the grammar is applied before the samplers (slower) // if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
// //
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); jarvis_token common_sampler_sample(struct common_sampler * gsmpl, struct jarvis_context * ctx, int idx, bool grammar_first = false);
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
// helpers // helpers
// access the internal list of current candidate tokens // access the internal list of current candidate tokens
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl); jarvis_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
// get the last accepted token // get the last accepted token
llama_token common_sampler_last(const struct common_sampler * gsmpl); jarvis_token common_sampler_last(const struct common_sampler * gsmpl);
// print the sampler chain into a string // print the sampler chain into a string
std::string common_sampler_print(const struct common_sampler * gsmpl); std::string common_sampler_print(const struct common_sampler * gsmpl);
// get a string representation of the last accepted tokens // get a string representation of the last accepted tokens
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n); std::string common_sampler_prev_str(common_sampler * gsmpl, jarvis_context * ctx, int n);
char common_sampler_type_to_chr(enum common_sampler_type cnstr); char common_sampler_type_to_chr(enum common_sampler_type cnstr);
std::string common_sampler_type_to_str(enum common_sampler_type cnstr); std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
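A minimal sketch of the sample/accept loop described in the comment at the top of this header, assuming this file is included as "sampling.h" and that common_sampler_params default-constructs to the usual sampler defaults:

#include "common.h"     // assumed include path
#include "sampling.h"   // assumed header name for the declarations above

// Sample `n_predict` tokens from the logits at output index `idx`, feeding each
// one back so penalties, history and the grammar stay in sync.
static void sample_n_tokens(const jarvis_model * model, jarvis_context * ctx, int idx, int n_predict) {
    common_sampler_params sparams;                       // assumed defaults
    common_sampler * smpl = common_sampler_init(model, sparams);

    for (int i = 0; i < n_predict; ++i) {
        const jarvis_token id = common_sampler_sample(smpl, ctx, idx);
        common_sampler_accept(smpl, id, /*accept_grammar=*/true);
        // ... decode `id` with the context here before sampling the next token ...
    }

    common_sampler_free(smpl);
}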
View file
@ -34,7 +34,7 @@ struct train_state * init_train_state() {
state->opt = new struct ggml_opt_context; state->opt = new struct ggml_opt_context;
state->opt->ctx = NULL; state->opt->ctx = NULL;
state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; state->opt->params.graph_size = JARVIS_TRAIN_MAX_NODES;
state->opt->loss_after = 0.0f; state->opt->loss_after = 0.0f;
return state; return state;
@ -213,7 +213,7 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
} }
int64_t get_example_targets_batch( int64_t get_example_targets_batch(
struct llama_context * lctx, struct jarvis_context * lctx,
struct ggml_tensor * tokens_input, struct ggml_tensor * tokens_input,
struct ggml_tensor * target_probs, struct ggml_tensor * target_probs,
int64_t example_id, int64_t example_id,
@ -221,7 +221,7 @@ int64_t get_example_targets_batch(
const size_t * samples_begin, const size_t * samples_begin,
const size_t * samples_size, const size_t * samples_size,
size_t samples_count, size_t samples_count,
const llama_token * train_data, const jarvis_token * train_data,
size_t n_train_data, size_t n_train_data,
bool separate_with_eos, bool separate_with_eos,
bool separate_with_bos, bool separate_with_bos,
@ -241,8 +241,8 @@ int64_t get_example_targets_batch(
int64_t used_samples = 0; int64_t used_samples = 0;
ggml_set_f32(target_probs, 0.0f); ggml_set_f32(target_probs, 0.0f);
llama_token bos = llama_token_bos(llama_get_model(lctx)); jarvis_token bos = jarvis_token_bos(jarvis_get_model(lctx));
llama_token eos = llama_token_eos(llama_get_model(lctx)); jarvis_token eos = jarvis_token_eos(jarvis_get_model(lctx));
// printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
for (int k=0; k<n_batch; ++k) { for (int k=0; k<n_batch; ++k) {
// printf("%s: batch %d\n", __func__, k); // printf("%s: batch %d\n", __func__, k);
@ -259,7 +259,7 @@ int64_t get_example_targets_batch(
bool sample_separation_eos = !separate_with_eos; bool sample_separation_eos = !separate_with_eos;
bool sample_separation_bos = !separate_with_bos; bool sample_separation_bos = !separate_with_bos;
for (int64_t i=0; i<n_tokens; ++i) { for (int64_t i=0; i<n_tokens; ++i) {
llama_token token = eos; jarvis_token token = eos;
if (sample_offs >= sample_size && fill_with_next_samples) { if (sample_offs >= sample_size && fill_with_next_samples) {
if (!sample_separation_eos) { if (!sample_separation_eos) {
// insert eos token to separate samples // insert eos token to separate samples
@ -281,7 +281,7 @@ int64_t get_example_targets_batch(
} }
// note: no else-if here // note: no else-if here
if (sample_offs < sample_size) { if (sample_offs < sample_size) {
token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1)); token = clamp(train_data[sample_begin+sample_offs], 0, (jarvis_token) (n_vocab - 1));
++sample_offs; ++sample_offs;
} }
ggml_set_f32_nd(target_probs, token, (int) i, (int) k, 0, +1.0f); ggml_set_f32_nd(target_probs, token, (int) i, (int) k, 0, +1.0f);
@ -712,12 +712,12 @@ void save_train_state_gguf(struct gguf_context * fctx, struct train_state * trai
} }
struct llama_file { struct jarvis_file {
// use FILE * so we don't have to re-open the file to mmap // use FILE * so we don't have to re-open the file to mmap
FILE * fp; FILE * fp;
size_t size; size_t size;
llama_file(const char * fname, const char * mode) { jarvis_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode); fp = std::fopen(fname, mode);
if (fp == NULL) { if (fp == NULL) {
size = 0; size = 0;
@ -788,7 +788,7 @@ struct llama_file {
write_raw(&val, sizeof(val)); write_raw(&val, sizeof(val));
} }
~llama_file() { ~jarvis_file() {
if (fp) { if (fp) {
std::fclose(fp); std::fclose(fp);
} }
@ -823,16 +823,16 @@ static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nu
} }
size_t tokenize_file( size_t tokenize_file(
struct llama_context * lctx, struct jarvis_context * lctx,
const char * filename, const char * filename,
const std::string & sample_start, const std::string & sample_start,
bool include_sample_start, bool include_sample_start,
bool overlapping_samples, bool overlapping_samples,
unsigned context_length, unsigned context_length,
std::vector<llama_token> & out_tokens, std::vector<jarvis_token> & out_tokens,
std::vector<size_t> & out_samples_begin, std::vector<size_t> & out_samples_begin,
std::vector<size_t> & out_samples_size) { std::vector<size_t> & out_samples_size) {
struct llama_file f(filename, "rb"); struct jarvis_file f(filename, "rb");
if (f.size == 0) { if (f.size == 0) {
out_tokens.clear(); out_tokens.clear();
@ -844,7 +844,7 @@ size_t tokenize_file(
} }
// account for possible leading whitespace that will be added by tokenizer // account for possible leading whitespace that will be added by tokenizer
// e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12] // e.g. '\t' will be tokenized by jarvis spm tokenizer to [29871, 12]
const int n_max_tokens_overhead = 1; const int n_max_tokens_overhead = 1;
std::vector<char> buf; std::vector<char> buf;
@ -862,8 +862,8 @@ size_t tokenize_file(
// tokenize all data at once // tokenize all data at once
out_tokens.resize(buf.size() + n_max_tokens_overhead); out_tokens.resize(buf.size() + n_max_tokens_overhead);
int n_tokens = llama_tokenize( int n_tokens = jarvis_tokenize(
llama_get_model(lctx), jarvis_get_model(lctx),
buf.data(), buf.data(),
(int) buf.size(), (int) buf.size(),
out_tokens.data(), out_tokens.data(),
@ -871,8 +871,8 @@ size_t tokenize_file(
false, false); false, false);
if (n_tokens < 0) { if (n_tokens < 0) {
out_tokens.resize(-n_tokens); out_tokens.resize(-n_tokens);
n_tokens = llama_tokenize( n_tokens = jarvis_tokenize(
llama_get_model(lctx), jarvis_get_model(lctx),
buf.data(), buf.data(),
(int) buf.size(), (int) buf.size(),
out_tokens.data(), out_tokens.data(),
@ -915,7 +915,7 @@ size_t tokenize_file(
out_samples_size.resize(out_samples_begin.size(), 0); out_samples_size.resize(out_samples_begin.size(), 0);
std::vector<char> buf_sample; std::vector<char> buf_sample;
std::vector<llama_token> tok_sample; std::vector<jarvis_token> tok_sample;
const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size()); const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size());
size_t found_too_big_sample = 0; size_t found_too_big_sample = 0;
@ -925,11 +925,11 @@ size_t tokenize_file(
size_t found_max_sample_size = 0; size_t found_max_sample_size = 0;
size_t max_token_text_size = 0; size_t max_token_text_size = 0;
int n_vocab = llama_n_vocab(llama_get_model(lctx)); int n_vocab = jarvis_n_vocab(jarvis_get_model(lctx));
for (llama_token token=0; token < n_vocab; ++token) { for (jarvis_token token=0; token < n_vocab; ++token) {
max_token_text_size = std::max( max_token_text_size = std::max(
max_token_text_size, max_token_text_size,
strlen(llama_token_get_text(llama_get_model(lctx), token))); strlen(jarvis_token_get_text(jarvis_get_model(lctx), token)));
} }
// upper bound of context byte length. // upper bound of context byte length.
@ -957,7 +957,7 @@ size_t tokenize_file(
} }
if (sample_size > 0) { if (sample_size > 0) {
// llama_tokenize expects zero terminated string, // jarvis_tokenize expects zero terminated string,
// copy sample into buffer and zero terminate it. // copy sample into buffer and zero terminate it.
buf_sample.resize(sample_size); buf_sample.resize(sample_size);
memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size); memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
@ -966,7 +966,7 @@ size_t tokenize_file(
// tokenize the sample // tokenize the sample
tok_sample.resize(buf_sample.size() + n_max_tokens_overhead); tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
int n_tokens = llama_tokenize(llama_get_model(lctx), int n_tokens = jarvis_tokenize(jarvis_get_model(lctx),
buf_sample.data(), buf_sample.data(),
(int) buf_sample.size(), (int) buf_sample.size(),
tok_sample.data(), tok_sample.data(),
@ -974,7 +974,7 @@ size_t tokenize_file(
false, false); false, false);
if (n_tokens < 0) { if (n_tokens < 0) {
tok_sample.resize(-n_tokens); tok_sample.resize(-n_tokens);
n_tokens = llama_tokenize(llama_get_model(lctx), n_tokens = jarvis_tokenize(jarvis_get_model(lctx),
buf_sample.data(), buf_sample.data(),
(int) buf_sample.size(), (int) buf_sample.size(),
tok_sample.data(), tok_sample.data(),
@ -1365,7 +1365,7 @@ bool consume_common_train_arg(
*invalid_param = true; *invalid_param = true;
return true; return true;
} }
if (llama_supports_gpu_offload()) { if (jarvis_supports_gpu_offload()) {
params->n_gpu_layers = std::stoi(argv[i]); params->n_gpu_layers = std::stoi(argv[i]);
} else { } else {
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
View file
@ -7,9 +7,9 @@
#include <vector> #include <vector>
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "jarvis.h"
#define LLAMA_TRAIN_MAX_NODES 16384 #define JARVIS_TRAIN_MAX_NODES 16384
typedef std::string mt19937_state; typedef std::string mt19937_state;
@ -92,9 +92,9 @@ struct train_opt_callback_data {
struct train_state * train; struct train_state * train;
save_train_files_callback save_cb; save_train_files_callback save_cb;
void * save_data; void * save_data;
struct llama_context * lctx; struct jarvis_context * lctx;
int last_save_iter; int last_save_iter;
llama_token * tokens_data; jarvis_token * tokens_data;
size_t tokens_size; size_t tokens_size;
size_t * samples_begin; size_t * samples_begin;
size_t * samples_size; size_t * samples_size;
@ -146,18 +146,18 @@ void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3); void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
size_t tokenize_file( size_t tokenize_file(
struct llama_context * lctx, struct jarvis_context * lctx,
const char * filename, const char * filename,
const std::string & sample_start, const std::string & sample_start,
bool include_sample_start, bool include_sample_start,
bool overlapping_samples, bool overlapping_samples,
unsigned context_length, unsigned context_length,
std::vector<llama_token> & out_tokens, std::vector<jarvis_token> & out_tokens,
std::vector<size_t> & out_samples_begin, std::vector<size_t> & out_samples_begin,
std::vector<size_t> & out_samples_size); std::vector<size_t> & out_samples_size);
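A minimal sketch of calling tokenize_file as declared above, with "train.h" assumed as the header name and "<s>" as a placeholder sample delimiter:

#include "train.h"   // assumed header name for the declarations above
#include <string>
#include <vector>

static void load_training_samples(jarvis_context * lctx, const char * path, unsigned n_ctx) {
    std::vector<jarvis_token> tokens;
    std::vector<size_t> samples_begin;
    std::vector<size_t> samples_size;
    tokenize_file(lctx, path, /*sample_start=*/"<s>",
                  /*include_sample_start=*/false, /*overlapping_samples=*/false,
                  n_ctx, tokens, samples_begin, samples_size);
    // `tokens` now holds the tokenized corpus; samples_begin/samples_size
    // delimit the individual training samples inside it.
}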
int64_t get_example_targets_batch( int64_t get_example_targets_batch(
struct llama_context * lctx, struct jarvis_context * lctx,
struct ggml_tensor * tokens_input, struct ggml_tensor * tokens_input,
struct ggml_tensor * target_probs, struct ggml_tensor * target_probs,
int64_t example_id, int64_t example_id,
@ -165,7 +165,7 @@ int64_t get_example_targets_batch(
const size_t * samples_begin, const size_t * samples_begin,
const size_t * samples_size, const size_t * samples_size,
size_t samples_count, size_t samples_count,
const llama_token * train_data, const jarvis_token * train_data,
size_t n_train_data, size_t n_train_data,
bool separate_with_eos, bool separate_with_eos,
bool separate_with_bos, bool separate_with_bos,
View file
@ -49,7 +49,7 @@ class Model:
_model_classes: dict[str, type[Model]] = {} _model_classes: dict[str, type[Model]] = {}
dir_model: Path dir_model: Path
ftype: gguf.LlamaFileType ftype: gguf.JarvisFileType
fname_out: Path fname_out: Path
is_big_endian: bool is_big_endian: bool
endianess: gguf.GGUFEndian endianess: gguf.GGUFEndian
@ -69,7 +69,7 @@ class Model:
# subclasses should define this! # subclasses should define this!
model_arch: gguf.MODEL_ARCH model_arch: gguf.MODEL_ARCH
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, def __init__(self, dir_model: Path, ftype: gguf.JarvisFileType, fname_out: Path, is_big_endian: bool = False,
use_temp_file: bool = False, eager: bool = False, use_temp_file: bool = False, eager: bool = False,
metadata_override: Path | None = None, model_name: str | None = None, metadata_override: Path | None = None, model_name: str | None = None,
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
@ -96,15 +96,15 @@ class Model:
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
if self.ftype == gguf.LlamaFileType.GUESSED: if self.ftype == gguf.JarvisFileType.GUESSED:
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
_, first_tensor = next(self.get_tensors()) _, first_tensor = next(self.get_tensors())
if first_tensor.dtype == torch.float16: if first_tensor.dtype == torch.float16:
logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})") logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_F16 self.ftype = gguf.JarvisFileType.MOSTLY_F16
else: else:
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_BF16 self.ftype = gguf.JarvisFileType.MOSTLY_BF16
# Configure GGUF Writer # Configure GGUF Writer
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
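To make the dtype heuristic above easier to follow in isolation, here is a minimal standalone sketch; the enum names follow the renamed `gguf` package in this tree, and everything outside the `if` is illustrative rather than the converter's actual code.

```python
import torch
import gguf

def guess_file_type(first_tensor: torch.Tensor) -> "gguf.JarvisFileType":
    # Trust the dtype of the first layer tensor rather than config.json,
    # because some finetunes misreport "torch_dtype".
    if first_tensor.dtype == torch.float16:
        return gguf.JarvisFileType.MOSTLY_F16
    return gguf.JarvisFileType.MOSTLY_BF16
```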
@ -308,7 +308,7 @@ class Model:
if n_dims <= 1 or new_name.endswith("_norm.weight"): if n_dims <= 1 or new_name.endswith("_norm.weight"):
data_qtype = gguf.GGMLQuantizationType.F32 data_qtype = gguf.GGMLQuantizationType.F32
# Conditions should closely match those in llama_model_quantize_internal in llama.cpp # Conditions should closely match those in jarvis_model_quantize_internal in jarvis.cpp
# Some tensor types are always in float32 # Some tensor types are always in float32
if data_qtype is False and ( if data_qtype is False and (
any( any(
@ -337,25 +337,25 @@ class Model:
) )
): ):
if self.ftype in ( if self.ftype in (
gguf.LlamaFileType.MOSTLY_TQ1_0, gguf.JarvisFileType.MOSTLY_TQ1_0,
gguf.LlamaFileType.MOSTLY_TQ2_0, gguf.JarvisFileType.MOSTLY_TQ2_0,
): ):
# TODO: use Q4_K and Q6_K # TODO: use Q4_K and Q6_K
data_qtype = gguf.GGMLQuantizationType.F16 data_qtype = gguf.GGMLQuantizationType.F16
# No override (data_qtype is False), or wants to be quantized (data_qtype is True) # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
if isinstance(data_qtype, bool): if isinstance(data_qtype, bool):
if self.ftype == gguf.LlamaFileType.ALL_F32: if self.ftype == gguf.JarvisFileType.ALL_F32:
data_qtype = gguf.GGMLQuantizationType.F32 data_qtype = gguf.GGMLQuantizationType.F32
elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: elif self.ftype == gguf.JarvisFileType.MOSTLY_F16:
data_qtype = gguf.GGMLQuantizationType.F16 data_qtype = gguf.GGMLQuantizationType.F16
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: elif self.ftype == gguf.JarvisFileType.MOSTLY_BF16:
data_qtype = gguf.GGMLQuantizationType.BF16 data_qtype = gguf.GGMLQuantizationType.BF16
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: elif self.ftype == gguf.JarvisFileType.MOSTLY_Q8_0:
data_qtype = gguf.GGMLQuantizationType.Q8_0 data_qtype = gguf.GGMLQuantizationType.Q8_0
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: elif self.ftype == gguf.JarvisFileType.MOSTLY_TQ1_0:
data_qtype = gguf.GGMLQuantizationType.TQ1_0 data_qtype = gguf.GGMLQuantizationType.TQ1_0
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: elif self.ftype == gguf.JarvisFileType.MOSTLY_TQ2_0:
data_qtype = gguf.GGMLQuantizationType.TQ2_0 data_qtype = gguf.GGMLQuantizationType.TQ2_0
else: else:
raise ValueError(f"Unknown file type: {self.ftype.name}") raise ValueError(f"Unknown file type: {self.ftype.name}")
@ -394,7 +394,7 @@ class Model:
if self.metadata.size_label is None and total_params > 0: if self.metadata.size_label is None and total_params > 0:
self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
# Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' # Extract the encoding scheme from the file type name. e.g. 'gguf.JarvisFileType.MOSTLY_Q8_0' --> 'Q8_0'
output_type: str = self.ftype.name.partition("_")[2] output_type: str = self.ftype.name.partition("_")[2]
# Filename Output # Filename Output
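The `partition("_")[2]` trick above works because `str.partition` splits on the first underscore only; a quick self-contained check:

```python
# The size qualifier (ALL_/MOSTLY_) is dropped and the quant name is kept
# whole, even when it contains further underscores.
assert "MOSTLY_Q8_0".partition("_")[2] == "Q8_0"
assert "ALL_F32".partition("_")[2] == "F32"
```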
@ -537,13 +537,13 @@ class Model:
# NOTE: this function is generated by convert_hf_to_gguf_update.py # NOTE: this function is generated by convert_hf_to_gguf_update.py
# do not modify it manually! # do not modify it manually!
# ref: https://github.com/ggerganov/llama.cpp/pull/6920 # ref: https://github.com/ggerganov/jarvis.cpp/pull/6920
# Marker: Start get_vocab_base_pre # Marker: Start get_vocab_base_pre
def get_vocab_base_pre(self, tokenizer) -> str: def get_vocab_base_pre(self, tokenizer) -> str:
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
# is specific for the BPE pre-tokenizer used by the model # is specific for the BPE pre-tokenizer used by the model
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer # use in jarvis.cpp to implement the same pre-tokenizer
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
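For readers following along, the identifier described in the comments is obtained by tokenizing `chktxt` and hashing the resulting token IDs. A minimal sketch of that idea (the model path is a placeholder; the generated function in this file is the authoritative version):

```python
from hashlib import sha256
from transformers import AutoTokenizer  # assumes the HF tokenizer files are available

tokenizer = AutoTokenizer.from_pretrained("path/to/hf-model")  # placeholder path
chktok = tokenizer.encode(chktxt)                 # chktxt as defined above
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(f"chkhsh: {chkhsh}")  # compared against the known hashes below
```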
@ -559,8 +559,8 @@ class Model:
# or pull the latest version of the model from Huggingface # or pull the latest version of the model from Huggingface
# don't edit the hashes manually! # don't edit the hashes manually!
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B # ref: https://huggingface.co/meta-jarvis/Meta-Jarvis-3-8B
res = "llama-bpe" res = "jarvis-bpe"
if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
# ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
res = "deepseek-llm" res = "deepseek-llm"
@ -616,7 +616,7 @@ class Model:
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-v2-de" res = "jina-v2-de"
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct # ref: https://huggingface.co/abacusai/Smaug-Jarvis-3-70B-Instruct
res = "smaug-bpe" res = "smaug-bpe"
if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360": if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
# ref: https://huggingface.co/LumiOpen/Poro-34B-chat # ref: https://huggingface.co/LumiOpen/Poro-34B-chat
@ -666,7 +666,7 @@ class Model:
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream") logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") logger.warning("** ref: https://github.com/ggerganov/jarvis.cpp/pull/6920")
logger.warning("**") logger.warning("**")
logger.warning(f"** chkhsh: {chkhsh}") logger.warning(f"** chkhsh: {chkhsh}")
logger.warning("**************************************************************************************") logger.warning("**************************************************************************************")
@ -746,7 +746,7 @@ class Model:
def _set_vocab_sentencepiece(self, add_to_gguf=True): def _set_vocab_sentencepiece(self, add_to_gguf=True):
tokens, scores, toktypes = self._create_vocab_sentencepiece() tokens, scores, toktypes = self._create_vocab_sentencepiece()
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("jarvis")
self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_scores(scores)
@ -835,8 +835,8 @@ class Model:
return tokens, scores, toktypes return tokens, scores, toktypes
def _set_vocab_llama_hf(self): def _set_vocab_jarvis_hf(self):
vocab = gguf.LlamaHfVocab(self.dir_model) vocab = gguf.JarvisHfVocab(self.dir_model)
tokens = [] tokens = []
scores = [] scores = []
toktypes = [] toktypes = []
@ -848,7 +848,7 @@ class Model:
assert len(tokens) == vocab.vocab_size assert len(tokens) == vocab.vocab_size
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("jarvis")
self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_scores(scores)
@ -857,7 +857,7 @@ class Model:
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "jarvis-spm"], vocab_size: int):
tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
vocab_reader = gguf.GGUFReader(tokenizer_path, "r") vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
@ -875,7 +875,7 @@ class Model:
assert field # token list assert field # token list
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
if model_name == "llama-spm": if model_name == "jarvis-spm":
field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
assert field # token scores assert field # token scores
self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
@ -884,7 +884,7 @@ class Model:
assert field # token types assert field # token types
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
if model_name != "llama-spm": if model_name != "jarvis-spm":
field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
assert field # token merges assert field # token merges
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
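As a side note, reading those fields back out of a prebuilt vocab-only GGUF uses the same reader API shown above; a small sketch with a hypothetical path:

```python
import gguf

reader = gguf.GGUFReader("models/ggml-vocab-jarvis-spm.gguf", "r")  # placeholder path
field = reader.get_field(gguf.Keys.Tokenizer.LIST)  # "tokenizer.ggml.tokens"
assert field is not None
tokens = [bytes(field.parts[i]) for i in field.data]
print(f"{len(tokens)} tokens; first few: {tokens[:5]}")
```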
@ -1226,7 +1226,7 @@ class XverseModel(Model):
tokens.append(token_text) tokens.append(token_text)
toktypes.append(toktype) toktypes.append(toktype)
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("jarvis")
self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_token_types(toktypes)
@ -1515,21 +1515,21 @@ class StableLMModel(Model):
raise ValueError(f"Unprocessed norms: {norms}") raise ValueError(f"Unprocessed norms: {norms}")
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") @Model.register("LLaMAForCausalLM", "JarvisForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
class LlamaModel(Model): class JarvisModel(Model):
model_arch = gguf.MODEL_ARCH.LLAMA model_arch = gguf.MODEL_ARCH.JARVIS
def set_vocab(self): def set_vocab(self):
try: try:
self._set_vocab_sentencepiece() self._set_vocab_sentencepiece()
except FileNotFoundError: except FileNotFoundError:
try: try:
self._set_vocab_llama_hf() self._set_vocab_jarvis_hf()
except (FileNotFoundError, TypeError): except (FileNotFoundError, TypeError):
# Llama 3 # Jarvis 3
self._set_vocab_gpt2() self._set_vocab_gpt2()
# Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) # Apply to CodeJarvis only (and ignore for Jarvis 3 with a vocab size of 128256)
if self.hparams.get("vocab_size", 32000) == 32016: if self.hparams.get("vocab_size", 32000) == 32016:
special_vocab = gguf.SpecialVocab( special_vocab = gguf.SpecialVocab(
self.dir_model, load_merges=False, self.dir_model, load_merges=False,
@ -1583,9 +1583,9 @@ class LlamaModel(Model):
n_kv_head = self.hparams.get("num_key_value_heads") n_kv_head = self.hparams.get("num_key_value_heads")
if name.endswith(("q_proj.weight", "q_proj.bias")): if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head) data_torch = JarvisModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")): if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
# process the experts separately # process the experts separately
if name.find("block_sparse_moe.experts") != -1: if name.find("block_sparse_moe.experts") != -1:
@ -1625,7 +1625,7 @@ class LlamaModel(Model):
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
if rope_scaling.get("rope_type", '').lower() == "llama3": if rope_scaling.get("rope_type", '').lower() == "jarvis3":
base = self.hparams.get("rope_theta", 10000.0) base = self.hparams.get("rope_theta", 10000.0)
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
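As a quick illustration of the frequency vector computed above (the values are hypothetical, not taken from any particular model):

```python
import torch

base = 500000.0   # hypothetical rope_theta
dim = 128         # hypothetical head_dim
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
# freqs[0] == 1.0 is the fastest-rotating pair; freqs[-1] is the slowest.
# The "llama3"/"jarvis3" rope scaling applied further down in the converter
# adjusts only the low-frequency (long-wavelength) end of this spectrum.
print(freqs[0].item(), freqs[-1].item())
```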
@ -1793,7 +1793,7 @@ class DbrxModel(Model):
# Specific behavior for experts tensors: suffix .weight, view as 3D and transpose # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
# original implementation expects (n_expert, n_ff, n_embd) for all experts weights # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
# But llama.cpp moe graph works differently # But jarvis.cpp moe graph works differently
# AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
# so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
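A tiny illustration of the dimension-order remark above, with made-up sizes (nothing here is DBRX-specific):

```python
import torch

n_expert, n_ff, n_embd = 4, 8, 6          # made-up sizes
experts = torch.empty(n_expert, n_ff, n_embd)

# ggml lists dimensions in the reverse order of torch/numpy, so this torch
# shape already corresponds to ggml ne = {n_embd, n_ff, n_expert}.
print(tuple(experts.shape))               # (4, 8, 6)  torch order
print(tuple(reversed(experts.shape)))     # (6, 8, 4)  ggml ne order
```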
@ -1842,7 +1842,7 @@ class MiniCPMModel(Model):
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
def set_vocab(self): def set_vocab(self):
self._set_vocab_llama_hf() self._set_vocab_jarvis_hf()
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
if n_kv_head is not None and n_head != n_kv_head: if n_kv_head is not None and n_head != n_kv_head:
@ -2188,7 +2188,7 @@ class Phi3MiniModel(Model):
if foken_data.get("special"): if foken_data.get("special"):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL toktypes[token_id] = SentencePieceTokenTypes.CONTROL
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("jarvis")
self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_scores(scores)
@ -2456,7 +2456,7 @@ class InternLM2Model(Model):
if foken_data.get("special"): if foken_data.get("special"):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL toktypes[token_id] = SentencePieceTokenTypes.CONTROL
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("jarvis")
self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_scores(scores)
@ -2468,7 +2468,7 @@ class InternLM2Model(Model):
if chat_eos_token_id is not None: if chat_eos_token_id is not None:
# For the chat model, we replace the eos with '<|im_end|>'. # For the chat model, we replace the eos with '<|im_end|>'.
# TODO: this is a hack, should be fixed # TODO: this is a hack, should be fixed
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 # https://github.com/ggerganov/jarvis.cpp/pull/6745#issuecomment-2067687048
special_vocab.special_token_ids["eos"] = chat_eos_token_id special_vocab.special_token_ids["eos"] = chat_eos_token_id
logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
" in chat mode so that the conversation can end normally.") " in chat mode so that the conversation can end normally.")
@ -2505,8 +2505,8 @@ class InternLM2Model(Model):
q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
# The model weights of q and k require an additional reshape. # The model weights of q and k require an additional reshape.
q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) q = JarvisModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) k = JarvisModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
v = v.reshape((-1, v.shape[-1])) v = v.reshape((-1, v.shape[-1]))
return [ return [
@ -2769,7 +2769,7 @@ class GemmaModel(Model):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused del bid # unused
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model # lm_head is not used in jarvis.cpp, while autoawq will include this tensor in model
# To prevent errors, skip loading lm_head.weight. # To prevent errors, skip loading lm_head.weight.
if name == "lm_head.weight": if name == "lm_head.weight":
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
@ -2816,7 +2816,7 @@ class Gemma2Model(Model):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused del bid # unused
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model # lm_head is not used in jarvis.cpp, while autoawq will include this tensor in model
# To prevent errors, skip loading lm_head.weight. # To prevent errors, skip loading lm_head.weight.
if name == "lm_head.weight": if name == "lm_head.weight":
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
@ -2894,7 +2894,7 @@ class Rwkv6Model(Model):
self.gguf_writer.add_feed_forward_length(intermediate_size) self.gguf_writer.add_feed_forward_length(intermediate_size)
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
# required by llama.cpp, unused # required by jarvis.cpp, unused
self.gguf_writer.add_head_count(0) self.gguf_writer.add_head_count(0)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@ -3024,7 +3024,7 @@ class OlmoModel(Model):
self.gguf_writer.add_clamp_kqv(clip_qkv) self.gguf_writer.add_clamp_kqv(clip_qkv)
# Same as super class, but permuting q_proj, k_proj # Same as super class, but permuting q_proj, k_proj
# Copied from: LlamaModel # Copied from: JarvisModel
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused del bid # unused
@ -3032,9 +3032,9 @@ class OlmoModel(Model):
n_kv_head = self.hparams.get("num_key_value_heads") n_kv_head = self.hparams.get("num_key_value_heads")
if name.endswith("q_proj.weight"): if name.endswith("q_proj.weight"):
data_torch = LlamaModel.permute(data_torch, n_head, n_head) data_torch = JarvisModel.permute(data_torch, n_head, n_head)
if name.endswith("k_proj.weight"): if name.endswith("k_proj.weight"):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
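For context, the `permute` helper used here (and in the other model classes) regroups the rows of `q_proj`/`k_proj` so the rotary-embedding pairing matches what the C++ rope kernel expects. Its body is not part of this hunk; the sketch below reproduces the commonly used GQA-aware permutation and should be treated as illustrative rather than authoritative.

```python
import torch

def permute(weights: torch.Tensor, n_head: int, n_head_kv: int | None) -> torch.Tensor:
    # Illustrative only: regroup rows per head so the rope pairs line up for ggml.
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))
```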
@ -3174,12 +3174,12 @@ class OpenELMModel(Model):
assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
# Uses the tokenizer from meta-llama/Llama-2-7b-hf # Uses the tokenizer from meta-jarvis/Jarvis-2-7b-hf
def set_vocab(self): def set_vocab(self):
try: try:
self._set_vocab_sentencepiece() self._set_vocab_sentencepiece()
except FileNotFoundError: except FileNotFoundError:
self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"]) self._set_vocab_builtin("jarvis-spm", self.hparams["vocab_size"])
def set_gguf_parameters(self): def set_gguf_parameters(self):
n_embd = self._n_embd n_embd = self._n_embd
@ -3300,7 +3300,7 @@ class ArcticModel(Model):
toktypes[token_id] = token_type toktypes[token_id] = token_type
scores[token_id] = token_score scores[token_id] = token_score
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("jarvis")
self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_scores(scores)
@ -3322,9 +3322,9 @@ class ArcticModel(Model):
n_kv_head = self.hparams.get("num_key_value_heads") n_kv_head = self.hparams.get("num_key_value_heads")
if name.endswith("q_proj.weight"): if name.endswith("q_proj.weight"):
data_torch = LlamaModel.permute(data_torch, n_head, n_head) data_torch = JarvisModel.permute(data_torch, n_head, n_head)
if name.endswith("k_proj.weight"): if name.endswith("k_proj.weight"):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
# process the experts separately # process the experts separately
if name.find("block_sparse_moe.experts") != -1: if name.find("block_sparse_moe.experts") != -1:
@ -3882,7 +3882,7 @@ class ChatGLMModel(Model):
scores.append(score) scores.append(score)
toktypes.append(toktype) toktypes.append(toktype)
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("jarvis")
# glm3 needs prefix and suffix formatted as: # glm3 needs prefix and suffix formatted as:
# prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>" # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
self.gguf_writer.add_tokenizer_pre("chatglm-spm") self.gguf_writer.add_tokenizer_pre("chatglm-spm")
@ -4087,7 +4087,7 @@ class ExaoneModel(Model):
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
if rope_scaling.get("rope_type", '').lower() == "llama3": if rope_scaling.get("rope_type", '').lower() == "jarvis3":
base = self.hparams.get("rope_theta", 10000.0) base = self.hparams.get("rope_theta", 10000.0)
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
@ -4116,12 +4116,12 @@ class ExaoneModel(Model):
@Model.register("GraniteForCausalLM") @Model.register("GraniteForCausalLM")
class GraniteModel(LlamaModel): class GraniteModel(JarvisModel):
"""Conversion for IBM's GraniteForCausalLM""" """Conversion for IBM's GraniteForCausalLM"""
model_arch = gguf.MODEL_ARCH.GRANITE model_arch = gguf.MODEL_ARCH.GRANITE
def set_gguf_parameters(self): def set_gguf_parameters(self):
"""Granite uses standard llama parameters with the following differences: """Granite uses standard jarvis parameters with the following differences:
- No head_dim support - No head_dim support
- New multiplier params: - New multiplier params:
@ -4196,9 +4196,9 @@ class ChameleonModel(Model):
hidden_dim = self.hparams.get("hidden_size") hidden_dim = self.hparams.get("hidden_size")
if name.endswith(("q_proj.weight", "q_proj.bias")): if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head) data_torch = JarvisModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")): if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
if name.endswith(("q_norm.weight", "q_norm.bias")): if name.endswith(("q_norm.weight", "q_norm.bias")):
data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim) data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
if name.endswith(("k_norm.weight", "k_norm.bias")): if name.endswith(("k_norm.weight", "k_norm.bias")):
@ -4379,14 +4379,14 @@ def main() -> None:
logger.error(f'Error: {args.model} is not a directory') logger.error(f'Error: {args.model} is not a directory')
sys.exit(1) sys.exit(1)
ftype_map: dict[str, gguf.LlamaFileType] = { ftype_map: dict[str, gguf.JarvisFileType] = {
"f32": gguf.LlamaFileType.ALL_F32, "f32": gguf.JarvisFileType.ALL_F32,
"f16": gguf.LlamaFileType.MOSTLY_F16, "f16": gguf.JarvisFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16, "bf16": gguf.JarvisFileType.MOSTLY_BF16,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, "q8_0": gguf.JarvisFileType.MOSTLY_Q8_0,
"tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0, "tq1_0": gguf.JarvisFileType.MOSTLY_TQ1_0,
"tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0, "tq2_0": gguf.JarvisFileType.MOSTLY_TQ2_0,
"auto": gguf.LlamaFileType.GUESSED, "auto": gguf.JarvisFileType.GUESSED,
} }
is_split = args.split_max_tensors > 0 or args.split_max_size != "0" is_split = args.split_max_tensors > 0 or args.split_max_size != "0"

View file

@ -5,10 +5,10 @@
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py # generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
# #
# This is necessary in order to analyze the type of pre-tokenizer used by the model and # This is necessary in order to analyze the type of pre-tokenizer used by the model and
# provide the necessary information to llama.cpp via the GGUF header in order to implement # provide the necessary information to jarvis.cpp via the GGUF header in order to implement
# the same pre-tokenizer. # the same pre-tokenizer.
# #
# ref: https://github.com/ggerganov/llama.cpp/pull/6920 # ref: https://github.com/ggerganov/jarvis.cpp/pull/6920
# #
# Instructions: # Instructions:
# #
@ -18,9 +18,9 @@
# python3 convert_hf_to_gguf_update.py <huggingface_token> # python3 convert_hf_to_gguf_update.py <huggingface_token>
# #
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py # - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
# - Update llama.cpp with the new pre-tokenizer if necessary # - Update jarvis.cpp with the new pre-tokenizer if necessary
# #
# TODO: generate tokenizer tests for llama.cpp # TODO: generate tokenizer tests for jarvis.cpp
# #
import logging import logging
@ -65,8 +65,8 @@ else:
# TODO: add models here, base models preferred # TODO: add models here, base models preferred
models = [ models = [
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", }, {"name": "jarvis-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-jarvis/Jarvis-2-7b-hf", },
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", }, {"name": "jarvis-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-jarvis/Meta-Jarvis-3-8B", },
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", }, {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", }, {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
@ -86,7 +86,7 @@ models = [
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Jarvis-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
@ -215,7 +215,7 @@ src_func = f"""
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
# is specific for the BPE pre-tokenizer used by the model # is specific for the BPE pre-tokenizer used by the model
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer # use in jarvis.cpp to implement the same pre-tokenizer
chktxt = {repr(CHK_TXT)} chktxt = {repr(CHK_TXT)}
@ -239,7 +239,7 @@ src_func = f"""
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream") logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") logger.warning("** ref: https://github.com/ggerganov/jarvis.cpp/pull/6920")
logger.warning("**") logger.warning("**")
logger.warning(f"** chkhsh: {{chkhsh}}") logger.warning(f"** chkhsh: {{chkhsh}}")
logger.warning("**************************************************************************************") logger.warning("**************************************************************************************")
@ -311,7 +311,7 @@ tests = [
"3333333", "3333333",
"33333333", "33333333",
"333333333", "333333333",
"Cửa Việt", # llama-bpe fails on this "Cửa Việt", # jarvis-bpe fails on this
" discards", " discards",
CHK_TXT, CHK_TXT,
] ]

View file

@ -223,13 +223,13 @@ class GGMLToGGUF:
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param" assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}') logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
self.n_kv_head = n_kv_head self.n_kv_head = n_kv_head
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer) self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.JARVIS, ggml_model.hyperparameters.n_layer)
def save(self): def save(self):
logger.info('* Preparing to save GGUF file') logger.info('* Preparing to save GGUF file')
gguf_writer = gguf.GGUFWriter( gguf_writer = gguf.GGUFWriter(
self.cfg.output, self.cfg.output,
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.JARVIS],
use_temp_file = False) use_temp_file = False)
self.add_params(gguf_writer) self.add_params(gguf_writer)
self.add_vocab(gguf_writer) self.add_vocab(gguf_writer)
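Outside the class, the same writer setup boils down to a couple of calls; a sketch with a hypothetical output path and layer count (constants follow this tree's renamed `gguf` package):

```python
import gguf

n_layer = 32  # hypothetical layer count
name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.JARVIS, n_layer)
gguf_writer = gguf.GGUFWriter("converted.gguf",  # placeholder output path
                              gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.JARVIS],
                              use_temp_file=False)
```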
@ -286,7 +286,7 @@ class GGMLToGGUF:
def add_vocab(self, gguf_writer): def add_vocab(self, gguf_writer):
hp = self.model.hyperparameters hp = self.model.hyperparameters
gguf_writer.add_tokenizer_model('llama') gguf_writer.add_tokenizer_model('jarvis')
gguf_writer.add_tokenizer_pre('default') gguf_writer.add_tokenizer_pre('default')
tokens = [] tokens = []
scores = [] scores = []
@ -358,7 +358,7 @@ class GGMLToGGUF:
def handle_metadata(cfg, hp): def handle_metadata(cfg, hp):
import examples.convert_legacy_llama as convert import examples.convert_legacy_jarvis as convert
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory' assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
hf_config_path = cfg.model_metadata_dir / "config.json" hf_config_path = cfg.model_metadata_dir / "config.json"

View file

@ -271,12 +271,12 @@ if __name__ == '__main__':
args = parse_args() args = parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
ftype_map: dict[str, gguf.LlamaFileType] = { ftype_map: dict[str, gguf.JarvisFileType] = {
"f32": gguf.LlamaFileType.ALL_F32, "f32": gguf.JarvisFileType.ALL_F32,
"f16": gguf.LlamaFileType.MOSTLY_F16, "f16": gguf.JarvisFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16, "bf16": gguf.JarvisFileType.MOSTLY_BF16,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, "q8_0": gguf.JarvisFileType.MOSTLY_Q8_0,
"auto": gguf.LlamaFileType.GUESSED, "auto": gguf.JarvisFileType.GUESSED,
} }
ftype = ftype_map[args.outtype] ftype = ftype_map[args.outtype]
@ -372,9 +372,9 @@ if __name__ == '__main__':
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
dest = list(super().modify_tensors(data_torch, name, bid)) dest = list(super().modify_tensors(data_torch, name, bid))
# some archs may have the same tensor for lm_head and output (tie word embeddings) # some archs may have the same tensor for lm_head and output (tie word embeddings)
# in this case, adapters targeting lm_head will fail when using llama-export-lora # in this case, adapters targeting lm_head will fail when using jarvis-export-lora
# therefore, we ignore them for now # therefore, we ignore them for now
# see: https://github.com/ggerganov/llama.cpp/issues/9065 # see: https://github.com/ggerganov/jarvis.cpp/issues/9065
if name == "lm_head.weight" and len(dest) == 0: if name == "lm_head.weight" and len(dest) == 0:
raise ValueError("lm_head is present in adapter, but is ignored in base model") raise ValueError("lm_head is present in adapter, but is ignored in base model")
for dest_name, dest_data in dest: for dest_name, dest_data in dest:

View file

@ -5,14 +5,14 @@
[Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid. [Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid.
With Termux, you can install and run `llama.cpp` as if the environment were Linux. Once in the Termux shell: With Termux, you can install and run `jarvis.cpp` as if the environment were Linux. Once in the Termux shell:
``` ```
$ apt update && apt upgrade -y $ apt update && apt upgrade -y
$ apt install git cmake $ apt install git cmake
``` ```
Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake. Then, follow the [build instructions](https://github.com/ggerganov/jarvis.cpp/blob/master/docs/build.md), specifically for CMake.
Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance: Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:
@ -20,22 +20,22 @@ Once the binaries are built, download your model of choice (e.g., from Hugging F
$ curl -L {model-url} -o ~/{model}.gguf $ curl -L {model-url} -o ~/{model}.gguf
``` ```
Then, if you are not already in the repo directory, `cd` into `llama.cpp` and: Then, if you are not already in the repo directory, `cd` into `jarvis.cpp` and:
``` ```
$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}" $ ./build/bin/jarvis-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
``` ```
Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal. Here, we show `jarvis-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone: To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
## Cross-compile using Android NDK ## Cross-compile using Android NDK
It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.) It's possible to build `jarvis.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.)
Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory: Once you're ready and have cloned `jarvis.cpp`, invoke the following in the project directory:
``` ```
$ cmake \ $ cmake \
@ -45,15 +45,15 @@ $ cmake \
-DCMAKE_C_FLAGS="-march=armv8.7a" \ -DCMAKE_C_FLAGS="-march=armv8.7a" \
-DCMAKE_CXX_FLAGS="-march=armv8.7a" \ -DCMAKE_CXX_FLAGS="-march=armv8.7a" \
-DGGML_OPENMP=OFF \ -DGGML_OPENMP=OFF \
-DGGML_LLAMAFILE=OFF \ -DGGML_JARVISFILE=OFF \
-B build-android -B build-android
``` ```
Notes: Notes:
- While later versions of Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time - While later versions of Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time
- `llamafile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/llamafile/issues/325) - `jarvisfile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/jarvisfile/issues/325)
The above command should configure `llama.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `llama.cpp` includes runtime checks for available CPU features it can use. The above command should configure `jarvis.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `jarvis.cpp` includes runtime checks for available CPU features it can use.
Feel free to adjust the Android ABI for your target. Once the project is configured: Feel free to adjust the Android ABI for your target. Once the project is configured:
@ -65,17 +65,17 @@ $ cmake --install build-android --prefix {install-dir} --config Release
After installing, go ahead and download the model of your choice to your host system. Then: After installing, go ahead and download the model of your choice to your host system. Then:
``` ```
$ adb shell "mkdir /data/local/tmp/llama.cpp" $ adb shell "mkdir /data/local/tmp/jarvis.cpp"
$ adb push {install-dir} /data/local/tmp/llama.cpp/ $ adb push {install-dir} /data/local/tmp/jarvis.cpp/
$ adb push {model}.gguf /data/local/tmp/llama.cpp/ $ adb push {model}.gguf /data/local/tmp/jarvis.cpp/
$ adb shell $ adb shell
``` ```
In the `adb shell`: In the `adb shell`:
``` ```
$ cd /data/local/tmp/llama.cpp $ cd /data/local/tmp/jarvis.cpp
$ LD_LIBRARY_PATH=lib ./bin/llama-simple -m {model}.gguf -c {context-size} -p "{your-prompt}" $ LD_LIBRARY_PATH=lib ./bin/jarvis-simple -m {model}.gguf -c {context-size} -p "{your-prompt}"
``` ```
That's it! That's it!

View file

@ -25,13 +25,13 @@ sudo make install
We recommend using OpenMP, since it makes it easier to control which cores are used. We recommend using OpenMP, since it makes it easier to control which cores are used.
### llama.cpp compilation ### jarvis.cpp compilation
Makefile: Makefile:
```bash ```bash
make GGML_BLIS=1 -j make GGML_BLIS=1 -j
# make GGML_BLIS=1 llama-benchmark-matmult # make GGML_BLIS=1 jarvis-benchmark-matmult
``` ```
CMake: CMake:
@ -43,7 +43,7 @@ cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
make -j make -j
``` ```
### llama.cpp execution ### jarvis.cpp execution
According to the BLIS documentation, we could set the following According to the BLIS documentation, we could set the following
environment variables to modify the behavior of openmp: environment variables to modify the behavior of openmp:

View file

@ -1,4 +1,4 @@
# llama.cpp for CANN # jarvis.cpp for CANN
- [Background](#background) - [Background](#background)
- [News](#news) - [News](#news)
@ -17,9 +17,9 @@
**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform. **CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
**Llama.cpp + CANN** **Jarvis.cpp + CANN**
The llama.cpp CANN backend is designed to support the Ascend NPU. It uses AscendC and ACLNN, which are integrated into the CANN Toolkit and kernels, to drive the Ascend NPU directly. The jarvis.cpp CANN backend is designed to support the Ascend NPU. It uses AscendC and ACLNN, which are integrated into the CANN Toolkit and kernels, to drive the Ascend NPU directly.
## News ## News
@ -78,11 +78,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
| GritLM-7B | √ | √ | √ | | GritLM-7B | √ | √ | √ |
| internlm2_5-7b-chat | √ | √ | √ | | internlm2_5-7b-chat | √ | √ | √ |
| koala-7B-HF | √ | √ | √ | | koala-7B-HF | √ | √ | √ |
| Llama-2-7b-chat-hf | √ | √ | √ | | Jarvis-2-7b-chat-hf | √ | √ | √ |
| Llama-3-Smaug-8B | √ | √ | √ | | Jarvis-3-Smaug-8B | √ | √ | √ |
| Llama2-Chinese-7b-Chat | √ | √ | √ | | Jarvis2-Chinese-7b-Chat | √ | √ | √ |
| Llama3-8B | √ | √ | √ | | Jarvis3-8B | √ | √ | √ |
| Llama3-8b-chinese | √ | √ | √ | | Jarvis3-8b-chinese | √ | √ | √ |
| mamba-130m-hf | √ | √ | √ | | mamba-130m-hf | √ | √ | √ |
| Mistral-7B-Instruct-v0.2 | √ | √ | √ | | Mistral-7B-Instruct-v0.2 | √ | √ | √ |
| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ | | Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
@ -120,9 +120,9 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
## Docker ## Docker
### Build Images ### Build Images
You can get an image with llama.cpp in one command. You can get an image with jarvis.cpp in one command.
```sh ```sh
docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile . docker build -t jarvis-cpp-cann -f .devops/jarvis-cli-cann.Dockerfile .
``` ```
### Run container ### Run container
@ -133,7 +133,7 @@ npu-smi info
# Select the cards that you want to use, make sure these cards are not used by someone. # Select the cards that you want to use, make sure these cards are not used by someone.
# Following using cards of device0. # Following using cards of device0.
docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:" docker run --name jarviscpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it jarvis-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
``` ```
*Notes:* *Notes:*
@ -208,7 +208,7 @@ docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager
Upon a successful installation, CANN is enabled for the available ascend devices. Upon a successful installation, CANN is enabled for the available ascend devices.
### II. Build llama.cpp ### II. Build jarvis.cpp
```sh ```sh
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
@ -242,13 +242,13 @@ cmake --build build --config release
- Use device 0: - Use device 0:
```sh ```sh
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 ./build/bin/jarvis-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
``` ```
- Use multiple devices: - Use multiple devices:
```sh ```sh
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ./build/bin/jarvis-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
``` ```
### **GitHub contribution**: ### **GitHub contribution**:

View file

@ -1,4 +1,4 @@
# llama.cpp for SYCL # jarvis.cpp for SYCL
- [Background](#background) - [Background](#background)
- [Recommended Release](#recommended-release) - [Recommended Release](#recommended-release)
@ -24,9 +24,9 @@
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs. - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets. - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
### Llama.cpp + SYCL ### Jarvis.cpp + SYCL
The llama.cpp SYCL backend is designed first and foremost to support **Intel GPUs**. Thanks to the cross-platform nature of SYCL, it also supports other vendors' GPUs: Nvidia and AMD. The jarvis.cpp SYCL backend is designed first and foremost to support **Intel GPUs**. Thanks to the cross-platform nature of SYCL, it also supports other vendors' GPUs: Nvidia and AMD.
## Recommended Release ## Recommended Release
@ -36,7 +36,7 @@ The following release is verified with good quality:
|Commit ID|Tag|Release|Verified Platform| |Commit ID|Tag|Release|Verified Platform|
|-|-|-|-| |-|-|-|-|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| |fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[jarvis-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/jarvis.cpp/releases/download/b3038/jarvis-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
## News ## News
@ -46,7 +46,7 @@ The following release is verified with good quality:
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs. - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
- 2024.5 - 2024.5
- Performance increased: 34 -> 37 tokens/s for llama-2-7b.Q4_0 on Arc770. - Performance increased: 34 -> 37 tokens/s for jarvis-2-7b.Q4_0 on Arc770.
- Arch Linux is verified successfully. - Arch Linux is verified successfully.
- 2024.4 - 2024.4
@ -54,8 +54,8 @@ The following release is verified with good quality:
- 2024.3 - 2024.3
- Release binary files of Windows. - Release binary files of Windows.
- A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd). - A blog is published: **Run LLM on all Intel GPUs Using jarvis.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-jarvis-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-jarvis-cpp-fd2e2dcbd9bd).
- A new baseline is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437). - A new baseline is ready: [tag b2437](https://github.com/ggerganov/jarvis.cpp/tree/b2437).
- Support multiple cards: **--split-mode**: [none|layer]; [row] is not supported yet and is under development. - Support multiple cards: **--split-mode**: [none|layer]; [row] is not supported yet and is under development.
- Support assigning the main GPU with **--main-gpu**, replacing $GGML_SYCL_DEVICE. - Support assigning the main GPU with **--main-gpu**, replacing $GGML_SYCL_DEVICE.
- Support detecting all GPUs that expose Level Zero and share the same top **Max compute units**. - Support detecting all GPUs that expose Level Zero and share the same top **Max compute units**.
@ -100,9 +100,9 @@ SYCL backend supports Intel GPU Family:
*Notes:* *Notes:*
- **Memory** - **Memory**
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli` (see the sketch after these notes). - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/jarvis-cli` (see the sketch after these notes).
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, the *llama-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU. - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, the *jarvis-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.
- **Execution Unit (EU)** - **Execution Unit (EU)**
- If the iGPU has fewer than 80 EUs, the inference speed will likely be too slow for practical use. - If the iGPU has fewer than 80 EUs, the inference speed will likely be too slow for practical use.
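As a quick sanity check, here is a minimal sketch (the model path, prompt, and log file name are placeholders) that keeps the run log and inspects the reported model buffer size mentioned above:

```sh
# minimal sketch: run a short generation, keep the log, then check the reported model buffer size
./build/bin/jarvis-cli -m models/jarvis-2-7b.Q4_0.gguf -p "test" -n 16 2>&1 | tee run.log
grep "llm_load_tensors" run.log
```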
@ -130,14 +130,14 @@ The docker build option is currently limited to *intel GPU* targets.
### Build image ### Build image
```sh ```sh
# Using FP16 # Using FP16
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile . docker build -t jarvis-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/jarvis-cli-intel.Dockerfile .
``` ```
*Notes*: *Notes*:
To build in the default FP32 mode *(slower than the FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command. To build in the default FP32 mode *(slower than the FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative. You can also use the `.devops/jarvis-server-intel.Dockerfile`, which builds the *"server"* alternative.
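For example, a sketch of building that server variant (the image tag `jarvis-cpp-sycl-server` is an arbitrary choice, not something defined by this repository):

```sh
# sketch: build the server image; drop the build-arg to fall back to the default FP32 build
docker build -t jarvis-cpp-sycl-server --build-arg="GGML_SYCL_F16=ON" -f .devops/jarvis-server-intel.Dockerfile .
```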
### Run container ### Run container
@ -145,7 +145,7 @@ You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *
# First, find all the DRI cards # First, find all the DRI cards
ls -la /dev/dri ls -la /dev/dri
# Then, pick the card that you want to use (here for e.g. /dev/dri/card1). # Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 jarvis-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
``` ```
*Notes:* *Notes:*
@ -276,7 +276,7 @@ For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9] [hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
``` ```
### II. Build llama.cpp ### II. Build jarvis.cpp
#### Intel GPU #### Intel GPU
@ -309,7 +309,7 @@ export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with Nvidia BLAS acceleration through SYCL # Build JARVIS with Nvidia BLAS acceleration through SYCL
# Option 1: Use FP32 (recommended for better performance in most cases) # Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
@ -329,7 +329,7 @@ export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with rocBLAS acceleration through SYCL # Build JARVIS with rocBLAS acceleration through SYCL
## AMD ## AMD
# Use FP32, FP16 is not supported # Use FP32, FP16 is not supported
@ -344,7 +344,7 @@ cmake --build build --config Release -j -v
#### Retrieve and prepare model #### Retrieve and prepare model
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example. You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [jarvis-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/blob/main/jarvis-2-7b.Q4_0.gguf) model as an example.
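For instance, a hedged sketch of fetching that file into a local `models/` folder (the download URL simply mirrors the link above and may need adjusting):

```sh
# sketch: download the example model into ./models (URL follows the link above; adjust if it has moved)
mkdir -p models
wget -O models/jarvis-2-7b.Q4_0.gguf https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/resolve/main/jarvis-2-7b.Q4_0.gguf
```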
##### Check device ##### Check device
@ -359,7 +359,7 @@ source /opt/intel/oneapi/setvars.sh
Similar to the native `sycl-ls`, available SYCL devices can be queried as follows: Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
```sh ```sh
./build/bin/llama-ls-sycl-device ./build/bin/jarvis-ls-sycl-device
``` ```
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
@ -390,12 +390,12 @@ Choose one of following methods to run.
- Use device 0: - Use device 0:
```sh ```sh
./examples/sycl/run-llama2.sh 0 ./examples/sycl/run-jarvis2.sh 0
``` ```
- Use multiple devices: - Use multiple devices:
```sh ```sh
./examples/sycl/run-llama2.sh ./examples/sycl/run-jarvis2.sh
``` ```
2. Command line 2. Command line
@ -418,13 +418,13 @@ Examples:
- Use device 0: - Use device 0:
```sh ```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 ZES_ENABLE_SYSMAN=1 ./build/bin/jarvis-cli -m models/jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
``` ```
- Use multiple devices: - Use multiple devices:
```sh ```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ZES_ENABLE_SYSMAN=1 ./build/bin/jarvis-cli -m models/jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
``` ```
*Notes:* *Notes:*
@ -492,7 +492,7 @@ a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can
b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/) b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
### II. Build llama.cpp ### II. Build jarvis.cpp
You can download the release package for Windows directly; it includes the binary files and the required oneAPI DLL files. You can download the release package for Windows directly; it includes the binary files and the required oneAPI DLL files.
@ -506,7 +506,7 @@ Choose one of following methods to build from source code.
2. CMake 2. CMake
On the oneAPI command line window, step into the llama.cpp main directory and run the following: On the oneAPI command line window, step into the jarvis.cpp main directory and run the following:
``` ```
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
@ -524,34 +524,34 @@ Or, use CMake presets to build:
```sh ```sh
cmake --preset x64-windows-sycl-release cmake --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-cli cmake --build build-x64-windows-sycl-release -j --target jarvis-cli
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-cli cmake --build build-x64-windows-sycl-release -j --target jarvis-cli
cmake --preset x64-windows-sycl-debug cmake --preset x64-windows-sycl-debug
cmake --build build-x64-windows-sycl-debug -j --target llama-cli cmake --build build-x64-windows-sycl-debug -j --target jarvis-cli
``` ```
3. Visual Studio 3. Visual Studio
You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. You can use Visual Studio to open jarvis.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
*Notes:* *Notes:*
- For a minimal experimental setup, you can build only the inference executable with `cmake --build build --config Release -j --target llama-cli`. - For a minimal experimental setup, you can build only the inference executable with `cmake --build build --config Release -j --target jarvis-cli`.
### III. Run the inference ### III. Run the inference
#### Retrieve and prepare model #### Retrieve and prepare model
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example. You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [jarvis-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/blob/main/jarvis-2-7b.Q4_0.gguf) model as an example.
##### Check device ##### Check device
1. Enable oneAPI running environment 1. Enable oneAPI running environment
On the oneAPI command line window, run the following and step into the llama.cpp directory: On the oneAPI command line window, run the following and step into the jarvis.cpp directory:
``` ```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
``` ```
@ -561,7 +561,7 @@ On the oneAPI command line window, run the following and step into the llama.cpp
Similar to the native `sycl-ls`, available SYCL devices can be queried as follows: Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
``` ```
build\bin\llama-ls-sycl-device.exe build\bin\jarvis-ls-sycl-device.exe
``` ```
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
@ -589,7 +589,7 @@ Choose one of following methods to run.
1. Script 1. Script
``` ```
examples\sycl\win-run-llama2.bat examples\sycl\win-run-jarvis2.bat
``` ```
2. Command line 2. Command line
@ -613,13 +613,13 @@ Examples:
- Use device 0: - Use device 0:
``` ```
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0 build\bin\jarvis-cli.exe -m models\jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
``` ```
- Use multiple devices: - Use multiple devices:
``` ```
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer build\bin\jarvis-cli.exe -m models\jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
``` ```
@ -682,13 +682,13 @@ use 1 SYCL GPUs: [0] with Max compute units:512
``` ```
Otherwise, please double-check the GPU driver installation steps. Otherwise, please double-check the GPU driver installation steps.
- Can I report an Ollama issue on Intel GPU to the llama.cpp SYCL backend? - Can I report an Ojarvis issue on Intel GPU to the jarvis.cpp SYCL backend?
No. We can't support Ollama issues directly, because we aren't familiar with Ollama. No. We can't support Ojarvis issues directly, because we aren't familiar with Ojarvis.
We suggest reproducing the problem on llama.cpp and reporting a similar issue to llama.cpp; we will support it there. We suggest reproducing the problem on jarvis.cpp and reporting a similar issue to jarvis.cpp; we will support it there.
The same applies to other projects, including the llama.cpp SYCL backend. The same applies to other projects, including the jarvis.cpp SYCL backend.
- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer` - Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
View file
@ -1,13 +1,13 @@
# Build llama.cpp locally # Build jarvis.cpp locally
**To get the Code:** **To get the Code:**
```bash ```bash
git clone https://github.com/ggerganov/llama.cpp git clone https://github.com/ggerganov/jarvis.cpp
cd llama.cpp cd jarvis.cpp
``` ```
In order to build llama.cpp you have four different options. In order to build jarvis.cpp you have four different options.
- Using `make`: - Using `make`:
- On Linux or MacOS: - On Linux or MacOS:
@ -21,17 +21,17 @@ In order to build llama.cpp you have four different options.
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Extract `w64devkit` on your pc. 2. Extract `w64devkit` on your pc.
3. Run `w64devkit.exe`. 3. Run `w64devkit.exe`.
4. Use the `cd` command to reach the `llama.cpp` folder. 4. Use the `cd` command to reach the `jarvis.cpp` folder.
5. From here you can run: 5. From here you can run:
```bash ```bash
make make
``` ```
- Notes: - Notes:
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`. - For `Q4_0_4_4` quantization type build, add the `GGML_NO_JARVISFILE=1` flag. For example, use `make GGML_NO_JARVISFILE=1`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel. - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/). - For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1` - For debug builds, run `make JARVIS_DEBUG=1`
- Using `CMake`: - Using `CMake`:
@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options.
**Notes**: **Notes**:
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`. - For `Q4_0_4_4` quantization type build, add the `-DGGML_JARVISFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_JARVISFILE=OFF`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel. - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/). - For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, there are two cases: - For debug builds, there are two cases:
@ -118,7 +118,7 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i
4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`. 4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`. 5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
6. Run `w64devkit.exe`. 6. Run `w64devkit.exe`.
7. Use the `cd` command to reach the `llama.cpp` folder. 7. Use the `cd` command to reach the `jarvis.cpp` folder.
8. From here you can run: 8. From here you can run:
```bash ```bash
@ -140,13 +140,13 @@ Check [BLIS.md](./backend/BLIS.md) for more information.
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators. SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU). jarvis.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md). For detailed info, please refer to [jarvis.cpp for SYCL](./backend/SYCL.md).
### Intel oneMKL ### Intel oneMKL
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md). Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [jarvis.cpp for SYCL](./backend/SYCL.md).
- Using manual oneAPI installation: - Using manual oneAPI installation:
By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps: By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
@ -159,7 +159,7 @@ Building through oneAPI compilers will make avx_vnni instruction set available f
- Using oneAPI docker image: - Using oneAPI docker image:
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above. If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information. Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-jarvis2-on-intel-cpu.html) for more information.
### CUDA ### CUDA
@ -300,7 +300,7 @@ Libs: -lvulkan-1
EOF EOF
``` ```
Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`. Switch into the `jarvis.cpp` directory and run `make GGML_VULKAN=1`.
#### MSYS2 #### MSYS2
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies. Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
@ -311,7 +311,7 @@ Install [MSYS2](https://www.msys2.org/) and then run the following commands in a
mingw-w64-ucrt-x86_64-vulkan-devel \ mingw-w64-ucrt-x86_64-vulkan-devel \
mingw-w64-ucrt-x86_64-shaderc mingw-w64-ucrt-x86_64-shaderc
``` ```
Switch into `llama.cpp` directory and build using CMake. Switch into `jarvis.cpp` directory and build using CMake.
```sh ```sh
cmake -B build -DGGML_VULKAN=ON cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release cmake --build build --config Release
@ -323,10 +323,10 @@ You don't need to install Vulkan SDK. It will be installed inside the container.
```sh ```sh
# Build the image # Build the image
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile . docker build -t jarvis-cpp-vulkan -f .devops/jarvis-cli-vulkan.Dockerfile .
# Then, use it: # Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 jarvis-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
``` ```
**Without docker**: **Without docker**:
@ -348,13 +348,13 @@ Alternatively your package manager might be able to provide the appropriate libr
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages. For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
Then, build llama.cpp using the cmake command below: Then, build jarvis.cpp using the cmake command below:
```bash ```bash
cmake -B build -DGGML_VULKAN=1 cmake -B build -DGGML_VULKAN=1
cmake --build build --config Release cmake --build build --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU) # Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 ./bin/jarvis-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
# You should see in the output, ggml_vulkan detected your GPU. For example: # You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
@ -367,7 +367,7 @@ For more information about Ascend NPU in [Ascend Community](https://www.hiascend
Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann) Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
Go to `llama.cpp` directory and build using CMake. Go to `jarvis.cpp` directory and build using CMake.
```bash ```bash
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
cmake --build build --config release cmake --build build --config release
@ -375,15 +375,15 @@ cmake --build build --config release
You can test with: You can test with:
`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32` `./build/jarvis-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
If the following info is output on screen, you are using the `llama.cpp` CANN backend: If the following info is output on screen, you are using the `jarvis.cpp` CANN backend:
```bash ```bash
llm_load_tensors: CANN buffer size = 13313.00 MiB llm_load_tensors: CANN buffer size = 13313.00 MiB
llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB jarvis_new_context_with_model: CANN compute buffer size = 1260.81 MiB
``` ```
For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md). For detailed info, such as model/device supports, CANN install, please refer to [jarvis.cpp for CANN](./backend/CANN.md).
### Android ### Android
@ -391,6 +391,6 @@ To read documentation for how to build on Android, [click here](./android.md)
### Arm CPU optimized mulmat kernels ### Arm CPU optimized mulmat kernels
Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. Jarvis.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`). To support `Q4_0_4_4`, you must build with `GGML_NO_JARVISFILE=1` (`make`) or `-DGGML_JARVISFILE=OFF` (`cmake`).
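As a rough sketch of that workflow (the quantize binary name and all paths are assumptions for illustration, not taken from this document):

```bash
# sketch: build with the Arm-optimized flags named above, then requantize a model to Q4_0_4_8 (int8mm)
cmake -B build -DGGML_JARVISFILE=OFF -DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve
cmake --build build --config Release -j 8
./build/bin/jarvis-quantize models/model-f16.gguf models/model-Q4_0_4_8.gguf Q4_0_4_8
```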
View file
@ -1,9 +1,9 @@
# Add a new model architecture to `llama.cpp` # Add a new model architecture to `jarvis.cpp`
Adding a model requires a few steps: Adding a model requires a few steps:
1. Convert the model to GGUF 1. Convert the model to GGUF
2. Define the model architecture in `llama.cpp` 2. Define the model architecture in `jarvis.cpp`
3. Build the GGML graph implementation 3. Build the GGML graph implementation
After following these steps, you can open a PR. After following these steps, you can open a PR.
@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
### 1. Convert the model to GGUF ### 1. Convert the model to GGUF
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library. This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format). Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_jarvis.py](/examples/convert_legacy_jarvis.py) (for `jarvis/jarvis2` models in `.pth` format).
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors. The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
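For example, a hedged sketch of invoking the Hugging Face converter (paths and the `--outtype` value are illustrative):

```bash
# sketch: convert a Hugging Face checkpoint directory to GGUF (paths are placeholders)
python convert_hf_to_gguf.py /path/to/hf-model --outfile /path/to/model-f16.gguf --outtype f16
```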
@ -81,26 +81,26 @@ Depending on the model configuration, tokenizer, code and tensors layout, you wi
NOTE: Tensor names must end with the `.weight` suffix; that is the convention, and several tools like `quantize` rely on it to identify the weights. NOTE: Tensor names must end with the `.weight` suffix; that is the convention, and several tools like `quantize` rely on it to identify the weights.
### 2. Define the model architecture in `llama.cpp` ### 2. Define the model architecture in `jarvis.cpp`
The model params and tensors layout must be defined in `llama.cpp`: The model params and tensors layout must be defined in `jarvis.cpp`:
1. Define a new `llm_arch` 1. Define a new `llm_arch`
2. Define the tensors layout in `LLM_TENSOR_NAMES` 2. Define the tensors layout in `LLM_TENSOR_NAMES`
3. Add any non standard metadata in `llm_load_hparams` 3. Add any non standard metadata in `llm_load_hparams`
4. Create the tensors for inference in `llm_load_tensors` 4. Create the tensors for inference in `llm_load_tensors`
5. If the model has a RoPE operation, add the rope type in `llama_rope_type` 5. If the model has a RoPE operation, add the rope type in `jarvis_rope_type`
NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions. NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
### 3. Build the GGML graph implementation ### 3. Build the GGML graph implementation
This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`. This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `jarvis_build_graph`.
Have a look at existing implementations such as `build_llama`, `build_dbrx` or `build_bert`. Have a look at existing implementations such as `build_jarvis`, `build_dbrx` or `build_bert`.
When implementing a new graph, please note that the underlying `ggml` backends might not support them all; support for missing backend operations can be added in another PR. When implementing a new graph, please note that the underlying `ggml` backends might not support them all; support for missing backend operations can be added in another PR.
Note: to debug the inference graph, you can use [llama-eval-callback](/examples/eval-callback/). Note: to debug the inference graph, you can use [jarvis-eval-callback](/examples/eval-callback/).
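A throwaway sketch of such a debugging run, assuming the tool shares the common CLI options (`-m`, `-p`); treat the exact flags as an assumption and verify them against the tool's help output:

```bash
# sketch: dump per-operation tensor data while evaluating a short prompt (flags assumed, verify with --help)
./build/bin/jarvis-eval-callback -m models/model-f16.gguf -p "hello world" -n 1
```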
## GGUF specification ## GGUF specification
@ -108,12 +108,12 @@ https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
## Resources ## Resources
- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268 - YaRN RoPE scaling https://github.com/ggerganov/jarvis.cpp/pull/2268
- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009 - support Baichuan serial models https://github.com/ggerganov/jarvis.cpp/pull/3009
- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283 - support attention bias https://github.com/ggerganov/jarvis.cpp/pull/4283
- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406 - Mixtral support https://github.com/ggerganov/jarvis.cpp/pull/4406
- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423 - BERT embeddings https://github.com/ggerganov/jarvis.cpp/pull/5423
- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204 - Grok-1 support https://github.com/ggerganov/jarvis.cpp/pull/6204
- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491 - Command R Plus support https://github.com/ggerganov/jarvis.cpp/pull/6491
- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515 - support arch DBRX https://github.com/ggerganov/jarvis.cpp/pull/6515
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948 - How to convert HuggingFace model to GGUF format https://github.com/ggerganov/jarvis.cpp/discussions/2948
View file
@ -51,7 +51,7 @@ rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
Set up and trigger a build in debug mode. You may adapt the arguments as needed, but in this case these are sane defaults. Set up and trigger a build in debug mode. You may adapt the arguments as needed, but in this case these are sane defaults.
```bash ```bash
cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON .. cmake -DCMAKE_BUILD_TYPE=Debug -DJARVIS_CUDA=1 -DJARVIS_FATAL_WARNINGS=ON ..
make -j make -j
``` ```
@ -71,12 +71,12 @@ This may return output similar to below (focusing on key lines to pay attention
```bash ```bash
... ...
1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf" 1: Test command: ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf"
1: Working Directory: . 1: Working Directory: .
Labels: main Labels: main
Test #1: test-tokenizer-0-llama-spm Test #1: test-tokenizer-0-jarvis-spm
... ...
4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf" 4: Test command: ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-falcon.gguf"
4: Working Directory: . 4: Working Directory: .
Labels: main Labels: main
Test #4: test-tokenizer-0-falcon Test #4: test-tokenizer-0-falcon
@ -86,8 +86,8 @@ Labels: main
#### Step 4: Identify Test Command for Debugging #### Step 4: Identify Test Command for Debugging
So for test #1 above we can tell these two pieces of relevant information: So for test #1 above we can tell these two pieces of relevant information:
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0` * Test Binary: `~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0`
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf` * Test GGUF Model: `~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf`
#### Step 5: Run GDB on test command #### Step 5: Run GDB on test command
@ -100,5 +100,5 @@ gdb --args ${Test Binary} ${Test GGUF Model}
Example: Example:
```bash ```bash
gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf" gdb --args ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf"
``` ```
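A non-interactive variant of the same idea, using gdb's batch mode (`-batch`/`-ex` are standard gdb options; the binary and model paths are the ones from the example above):

```bash
# sketch: run the test under gdb non-interactively and print a backtrace when it stops
gdb -batch -ex run -ex backtrace --args ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf"
```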
View file
@ -1,23 +1,23 @@
# Token generation performance troubleshooting # Token generation performance troubleshooting
## Verifying that the model is running on the GPU with CUDA ## Verifying that the model is running on the GPU with CUDA
Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: Make sure you compiled jarvis with the correct env variables according to [this guide](/docs/build.md#cuda), so that jarvis accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running jarvis, you may configure `N` to be very large, and jarvis will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell ```shell
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " ./jarvis-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
``` ```
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines: When running jarvis, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
```shell ```shell
llama_model_load_internal: [cublas] offloading 60 layers to GPU jarvis_model_load_internal: [cublas] offloading 60 layers to GPU
llama_model_load_internal: [cublas] offloading output layer to GPU jarvis_model_load_internal: [cublas] offloading output layer to GPU
llama_model_load_internal: [cublas] total VRAM used: 17223 MB jarvis_model_load_internal: [cublas] total VRAM used: 17223 MB
... rest of inference ... rest of inference
``` ```
If you see these lines, then the GPU is being used. If you see these lines, then the GPU is being used.
## Verifying that the CPU is not oversaturated ## Verifying that the CPU is not oversaturated
llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down. jarvis accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
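A throwaway sketch of that probing loop (the model path and prompt are placeholders, and the exact wording of the timing line may differ between builds):

```shell
# sketch: try increasing thread counts and compare the reported generation speed
for t in 1 2 4 8 16; do
  echo "threads: $t"
  ./jarvis-cli -m "path/to/model.gguf" -t $t -n 64 -p "A short test prompt" 2>&1 | grep -i "tokens per second"
done
```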
# Example of runtime flags effect on inference speed benchmark # Example of runtime flags effect on inference speed benchmark
These runs were tested on the following machine: These runs were tested on the following machine:
@ -27,7 +27,7 @@ RAM: 32GB
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML) Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]` Run command: `./jarvis-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Result: Result:
View file
@ -2,26 +2,26 @@
## Prerequisites ## Prerequisites
* Docker must be installed and running on your system. * Docker must be installed and running on your system.
* Create a folder to store big models & intermediate files (ex. /llama/models) * Create a folder to store big models & intermediate files (ex. /jarvis/models)
## Images ## Images
We have three Docker images available for this project: We have three Docker images available for this project:
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) 1. `ghcr.io/ggerganov/jarvis.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) 2. `ghcr.io/ggerganov/jarvis.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`) 3. `ghcr.io/ggerganov/jarvis.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
Additionally, the following images are available, similar to the above (a pull example follows this list): Additionally, the following images are available, similar to the above (a pull example follows this list):
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/jarvis.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/jarvis.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/jarvis.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/jarvis.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/jarvis.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/jarvis.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/jarvis.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/jarvis.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/jarvis.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
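As a minimal example of using one of the tags listed above (any of the other tags can be substituted):

```bash
# sketch: pull the CPU-only "light" image before running it
docker pull ghcr.io/ggerganov/jarvis.cpp:light
```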
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now. The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.
@ -32,25 +32,25 @@ The easiest way to download the models, convert them to ggml and optimize them i
Replace `/path/to/models` below with the actual path where you downloaded the models. Replace `/path/to/models` below with the actual path where you downloaded the models.
```bash ```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:full --all-in-one "/models/" 7B
``` ```
On completion, you are ready to play! On completion, you are ready to play!
```bash ```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
``` ```
or with a light image: or with a light image:
```bash ```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
``` ```
or with a server image: or with a server image:
```bash ```bash
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/jarvis.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
``` ```
## Docker With CUDA ## Docker With CUDA
@ -60,9 +60,9 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
## Building Docker locally ## Building Docker locally
```bash ```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile . docker build -t local/jarvis.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile . docker build -t local/jarvis.cpp:light-cuda -f .devops/jarvis-cli-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile . docker build -t local/jarvis.cpp:server-cuda -f .devops/jarvis-server-cuda.Dockerfile .
``` ```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
@ -74,18 +74,18 @@ The defaults are:
The resulting images are essentially the same as the non-CUDA images: The resulting images are essentially the same as the non-CUDA images:
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. 1. `local/jarvis.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file. 2. `local/jarvis.cpp:light-cuda`: This image only includes the main executable file.
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file. 3. `local/jarvis.cpp:server-cuda`: This image only includes the server executable file.
## Usage ## Usage
After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag. After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
```bash ```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
``` ```
## Docker With MUSA ## Docker With MUSA
@ -95,9 +95,9 @@ Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/
## Building Docker locally ## Building Docker locally
```bash ```bash
docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile . docker build -t local/jarvis.cpp:full-musa -f .devops/full-musa.Dockerfile .
docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile . docker build -t local/jarvis.cpp:light-musa -f .devops/jarvis-cli-musa.Dockerfile .
docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile . docker build -t local/jarvis.cpp:server-musa -f .devops/jarvis-server-musa.Dockerfile .
``` ```
You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture. You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.
@ -108,16 +108,16 @@ The defaults are:
The resulting images are essentially the same as the non-MUSA images: The resulting images are essentially the same as the non-MUSA images:
1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. 1. `local/jarvis.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-musa`: This image only includes the main executable file. 2. `local/jarvis.cpp:light-musa`: This image only includes the main executable file.
3. `local/llama.cpp:server-musa`: This image only includes the server executable file. 3. `local/jarvis.cpp:server-musa`: This image only includes the server executable file.
## Usage ## Usage
After building locally, usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as the default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the changes by executing `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag. After building locally, usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as the default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the changes by executing `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag.
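Spelled out, a sketch of that one-time runtime setup (the commands are the ones quoted in the sentence above):

```bash
# sketch: register the MUSA ("mthreads") Docker runtime, then confirm it is active
(cd /usr/bin/musa && sudo ./docker setup $PWD)
docker info | grep mthreads
```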
```bash ```bash
docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run -v /path/to/models:/models local/jarvis.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run -v /path/to/models:/models local/jarvis.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 docker run -v /path/to/models:/models local/jarvis.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
``` ```
View file
@ -1,39 +1,39 @@
# Install pre-built version of llama.cpp # Install pre-built version of jarvis.cpp
## Homebrew ## Homebrew
On Mac and Linux, the homebrew package manager can be used via On Mac and Linux, the homebrew package manager can be used via
```sh ```sh
brew install llama.cpp brew install jarvis.cpp
``` ```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668 The formula is automatically updated with new `jarvis.cpp` releases. More info: https://github.com/ggerganov/jarvis.cpp/discussions/7668
## Nix ## Nix
On Mac and Linux, the Nix package manager can be used via On Mac and Linux, the Nix package manager can be used via
```sh ```sh
nix profile install nixpkgs#llama-cpp nix profile install nixpkgs#jarvis-cpp
``` ```
For flake enabled installs. For flake enabled installs.
Or Or
```sh ```sh
nix-env --file '<nixpkgs>' --install --attr llama-cpp nix-env --file '<nixpkgs>' --install --attr jarvis-cpp
``` ```
For non-flake enabled installs. For non-flake enabled installs.
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164). This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/jarvis-cpp/package.nix#L164).
## Flox ## Flox
On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via On Mac and Linux, Flox can be used to install jarvis.cpp within a Flox environment via
```sh ```sh
flox install llama-cpp flox install jarvis-cpp
``` ```
Flox follows the nixpkgs build of llama.cpp. Flox follows the nixpkgs build of jarvis.cpp.
View file
@ -13,10 +13,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN) if (EMSCRIPTEN)
else() else()
add_subdirectory(cvector-generator) add_subdirectory(cvector-generator)
add_subdirectory(baby-llama) add_subdirectory(baby-jarvis)
add_subdirectory(batched-bench) add_subdirectory(batched-bench)
add_subdirectory(batched) add_subdirectory(batched)
add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(convert-jarvis2c-to-ggml)
add_subdirectory(embedding) add_subdirectory(embedding)
add_subdirectory(eval-callback) add_subdirectory(eval-callback)
add_subdirectory(export-lora) add_subdirectory(export-lora)
@ -27,7 +27,7 @@ else()
add_subdirectory(gritlm) add_subdirectory(gritlm)
add_subdirectory(imatrix) add_subdirectory(imatrix)
add_subdirectory(infill) add_subdirectory(infill)
add_subdirectory(llama-bench) add_subdirectory(jarvis-bench)
add_subdirectory(llava) add_subdirectory(llava)
add_subdirectory(lookahead) add_subdirectory(lookahead)
add_subdirectory(lookup) add_subdirectory(lookup)
@ -41,7 +41,7 @@ else()
if (GGML_RPC) if (GGML_RPC)
add_subdirectory(rpc) add_subdirectory(rpc)
endif() endif()
if (LLAMA_BUILD_SERVER) if (JARVIS_BUILD_SERVER)
add_subdirectory(server) add_subdirectory(server)
endif() endif()
if (GGML_SYCL) if (GGML_SYCL)
View file
@ -2,7 +2,7 @@
set -e set -e
AI_NAME="${AI_NAME:-Miku}" AI_NAME="${AI_NAME:-Miku}"
MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}" MODEL="${MODEL:-./models/jarvis-2-7b-chat.ggmlv3.q4_K_M.bin}"
USER_NAME="${USER_NAME:-Anon}" USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use. # Uncomment and adjust to the number of CPU cores you want to use.
@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD") GEN_OPTIONS+=(--threads "$N_THREAD")
fi fi
./llama-cli "${GEN_OPTIONS[@]}" \ ./jarvis-cli "${GEN_OPTIONS[@]}" \
--model "$MODEL" \ --model "$MODEL" \
--in-prefix " " \ --in-prefix " " \
--in-suffix "${AI_NAME}:" \ --in-suffix "${AI_NAME}:" \
View file
@ -1,5 +1,5 @@
set(TARGET llama-baby-llama) set(TARGET jarvis-baby-jarvis)
add_executable(${TARGET} baby-llama.cpp) add_executable(${TARGET} baby-jarvis.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -11,8 +11,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
#ifdef LLAMA_DEFAULT_RMS_EPS #ifdef JARVIS_DEFAULT_RMS_EPS
constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; constexpr float rms_norm_eps = JARVIS_DEFAULT_RMS_EPS;
#else #else
constexpr float rms_norm_eps = 5e-6f; constexpr float rms_norm_eps = 5e-6f;
#endif #endif
@ -71,7 +71,7 @@ static struct ggml_tensor * randomize_tensor(
return tensor; return tensor;
} }
struct llama_hparams { struct jarvis_hparams {
uint32_t n_vocab = 32000; uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input? uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096; uint32_t n_embd = 4096;
@ -80,17 +80,17 @@ struct llama_hparams {
uint32_t n_layer = 32; uint32_t n_layer = 32;
uint32_t n_rot = 64; uint32_t n_rot = 64;
bool operator!=(const llama_hparams & other) const { bool operator!=(const jarvis_hparams & other) const {
return memcmp(this, &other, sizeof(llama_hparams)); return memcmp(this, &other, sizeof(jarvis_hparams));
} }
}; };
static uint32_t get_n_ff(const struct llama_hparams* hparams) { static uint32_t get_n_ff(const struct jarvis_hparams* hparams) {
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
return n_ff; return n_ff;
} }
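For orientation, `get_n_ff` rounds two thirds of the `4*n_embd` FFN width up to a multiple of `n_mult`. A worked check under assumed values (`n_embd = 4096` matches the struct above; `n_mult = 256` is an assumption, since its default is not visible in this hunk):

```sh
# 2*(4*4096)/3 = 10922, rounded up to a multiple of 256 -> 11008
n_embd=4096; n_mult=256   # n_mult is assumed, not shown in the diff
echo $(( ((2*(4*n_embd)/3 + n_mult - 1) / n_mult) * n_mult ))   # prints 11008
```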
struct llama_hparams_lora { struct jarvis_hparams_lora {
uint32_t n_vocab = 32000; uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input? uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096; uint32_t n_embd = 4096;
@ -100,12 +100,12 @@ struct llama_hparams_lora {
uint32_t n_rot = 64; uint32_t n_rot = 64;
uint32_t n_lora = 64; uint32_t n_lora = 64;
bool operator!=(const llama_hparams_lora & other) const { bool operator!=(const jarvis_hparams_lora & other) const {
return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0; return memcmp(this, &other, sizeof(jarvis_hparams_lora)) != 0;
} }
}; };
struct llama_layer { struct jarvis_layer {
// normalization // normalization
struct ggml_tensor * attention_norm; struct ggml_tensor * attention_norm;
@ -124,7 +124,7 @@ struct llama_layer {
struct ggml_tensor * w3; struct ggml_tensor * w3;
}; };
struct llama_layer_lora { struct jarvis_layer_lora {
// normalization // normalization
struct ggml_tensor * attention_norm; struct ggml_tensor * attention_norm;
@ -148,34 +148,34 @@ struct llama_layer_lora {
}; };
struct llama_kv_cache { struct jarvis_kv_cache {
struct ggml_context * ctx = NULL; struct ggml_context * ctx = NULL;
struct ggml_tensor * k; struct ggml_tensor * k;
struct ggml_tensor * v; struct ggml_tensor * v;
// llama_ctx_buffer buf; // jarvis_ctx_buffer buf;
int n; // number of tokens currently in the cache int n; // number of tokens currently in the cache
}; };
struct llama_model { struct jarvis_model {
struct ggml_context * ctx = NULL; struct ggml_context * ctx = NULL;
llama_hparams hparams; jarvis_hparams hparams;
struct ggml_tensor * tok_embeddings; struct ggml_tensor * tok_embeddings;
struct ggml_tensor * norm; struct ggml_tensor * norm;
struct ggml_tensor * output; struct ggml_tensor * output;
std::vector<llama_layer> layers; std::vector<jarvis_layer> layers;
}; };
struct llama_model_lora { struct jarvis_model_lora {
struct ggml_context * ctx = NULL; struct ggml_context * ctx = NULL;
llama_hparams_lora hparams; jarvis_hparams_lora hparams;
struct ggml_tensor * tok_embeddings; struct ggml_tensor * tok_embeddings;
@ -183,10 +183,10 @@ struct llama_model_lora {
struct ggml_tensor * outputa; struct ggml_tensor * outputa;
struct ggml_tensor * outputb; struct ggml_tensor * outputb;
std::vector<llama_layer_lora> layers; std::vector<jarvis_layer_lora> layers;
}; };
static void init_model(struct llama_model * model) { static void init_model(struct jarvis_model * model) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd; const uint32_t n_embd = hparams.n_embd;
@ -223,7 +223,7 @@ static void init_model(struct llama_model * model) {
} }
static void init_model_lora(struct llama_model_lora * model) { static void init_model_lora(struct jarvis_model_lora * model) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd; const uint32_t n_embd = hparams.n_embd;
@ -266,7 +266,7 @@ static void init_model_lora(struct llama_model_lora * model) {
} }
} }
static void set_param_model(struct llama_model * model) { static void set_param_model(struct jarvis_model * model) {
const auto& hparams = model->hparams; const auto& hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@ -292,7 +292,7 @@ static void set_param_model(struct llama_model * model) {
} }
} }
static void set_param_model_lora(struct llama_model_lora * model) { static void set_param_model_lora(struct jarvis_model_lora * model) {
const auto& hparams = model->hparams; const auto& hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@ -323,7 +323,7 @@ static void set_param_model_lora(struct llama_model_lora * model) {
} }
} }
static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) { static void randomize_model(struct jarvis_model * model, int seed, float mean, float std, float min, float max) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@ -355,7 +355,7 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl
static void randomize_model_lora( static void randomize_model_lora(
struct llama_model_lora * model, int seed, float mean, float std, float min, float max struct jarvis_model_lora * model, int seed, float mean, float std, float min, float max
) { ) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
@ -391,7 +391,7 @@ static void randomize_model_lora(
free_random_normal_distribution(rnd); free_random_normal_distribution(rnd);
} }
static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { static void init_kv_cache(struct jarvis_kv_cache* cache, struct jarvis_model * model, int n_batch) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_ctx = hparams.n_ctx; const uint32_t n_ctx = hparams.n_ctx;
@ -425,7 +425,7 @@ static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * mod
cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
} }
static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) { static bool init_kv_cache_lora(struct jarvis_kv_cache* cache, struct jarvis_model_lora * model, int n_batch) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_ctx = hparams.n_ctx; const uint32_t n_ctx = hparams.n_ctx;
@ -462,8 +462,8 @@ static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_
} }
static struct ggml_tensor * forward( static struct ggml_tensor * forward(
struct llama_model * model, struct jarvis_model * model,
struct llama_kv_cache * cache, struct jarvis_kv_cache * cache,
struct ggml_context * ctx0, struct ggml_context * ctx0,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input, struct ggml_tensor * tokens_input,
@ -472,7 +472,7 @@ static struct ggml_tensor * forward(
) { ) {
const int N = n_tokens; const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache; struct jarvis_kv_cache& kv_self = *cache;
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx; const int n_ctx = hparams.n_ctx;
const int n_embd = hparams.n_embd; const int n_embd = hparams.n_embd;
@ -692,8 +692,8 @@ static struct ggml_tensor * forward(
} }
static struct ggml_tensor * forward_batch( static struct ggml_tensor * forward_batch(
struct llama_model * model, struct jarvis_model * model,
struct llama_kv_cache * cache, struct jarvis_kv_cache * cache,
struct ggml_context * ctx0, struct ggml_context * ctx0,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input, struct ggml_tensor * tokens_input,
@ -703,7 +703,7 @@ static struct ggml_tensor * forward_batch(
) { ) {
const int N = n_tokens; const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache; struct jarvis_kv_cache& kv_self = *cache;
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx; const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab; const int n_vocab = hparams.n_vocab;
@ -989,8 +989,8 @@ static struct ggml_tensor * forward_batch(
} }
static struct ggml_tensor * forward_lora( static struct ggml_tensor * forward_lora(
struct llama_model_lora * model, struct jarvis_model_lora * model,
struct llama_kv_cache * cache, struct jarvis_kv_cache * cache,
struct ggml_context * ctx0, struct ggml_context * ctx0,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input, struct ggml_tensor * tokens_input,
@ -999,7 +999,7 @@ static struct ggml_tensor * forward_lora(
) { ) {
const int N = n_tokens; const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache; struct jarvis_kv_cache& kv_self = *cache;
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx; const int n_ctx = hparams.n_ctx;
@ -1444,7 +1444,7 @@ int main(int argc, char ** argv) {
lcparams.mem_buffer = NULL; lcparams.mem_buffer = NULL;
lcparams.no_alloc = false; lcparams.no_alloc = false;
struct llama_model model; struct jarvis_model model;
model.hparams.n_vocab = 8; model.hparams.n_vocab = 8;
model.hparams.n_ctx = 8; model.hparams.n_ctx = 8;
model.hparams.n_embd = 32; model.hparams.n_embd = 32;
@ -1467,7 +1467,7 @@ int main(int argc, char ** argv) {
randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f); randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
/* /*
struct llama_model_lora model_lora; struct jarvis_model_lora model_lora;
// model.hparams.n_vocab = 6; // model.hparams.n_vocab = 6;
// model.hparams.n_ctx = 64; // model.hparams.n_ctx = 64;
// model.hparams.n_embd = 128; // model.hparams.n_embd = 128;
@ -1501,7 +1501,7 @@ int main(int argc, char ** argv) {
*/ */
int n_batch = 8; int n_batch = 8;
// key + value cache for the self attention // key + value cache for the self attention
struct llama_kv_cache kv_self; struct jarvis_kv_cache kv_self;
printf("init_kv_cache\n"); printf("init_kv_cache\n");
kv_self.ctx = model.ctx; kv_self.ctx = model.ctx;
init_kv_cache(&kv_self, &model, n_batch); init_kv_cache(&kv_self, &model, n_batch);
@ -1533,7 +1533,7 @@ int main(int argc, char ** argv) {
int n_past = 0; int n_past = 0;
struct ggml_cgraph * gf = NULL; struct ggml_cgraph * gf = NULL;
gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true); gf = ggml_new_graph_custom(ctx0, JARVIS_TRAIN_MAX_NODES, true);
get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets); get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);
@ -1601,7 +1601,7 @@ int main(int argc, char ** argv) {
struct ggml_context * ctx0 = ggml_init(params); struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph * gf = NULL; struct ggml_cgraph * gf = NULL;
gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true); gf = ggml_new_graph_custom(ctx0, JARVIS_TRAIN_MAX_NODES, true);
int n_past = 0; int n_past = 0;
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past); struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);

View file

@ -5,7 +5,7 @@
# #
# Usage: # Usage:
# #
# cd llama.cpp # cd jarvis.cpp
# make -j # make -j
# #
# ./examples/base-translate.sh <model-base> "<text>" [extra-main-args] # ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
@ -21,7 +21,7 @@ if [ $# -gt 2 ]; then
eargs="${@:3}" eargs="${@:3}"
fi fi
ftmp="__llama.cpp_example_tmp__.txt" ftmp="__jarvis.cpp_example_tmp__.txt"
trap "rm -f $ftmp" EXIT trap "rm -f $ftmp" EXIT
echo "Translate from English to French: echo "Translate from English to French:
@ -58,4 +58,4 @@ echo "$2
model=$1 model=$1
# generate the most likely continuation until the string "===" is found # generate the most likely continuation until the string "===" is found
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs ./jarvis-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
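Putting the usage comment above into practice, a hedged invocation sketch (the model filename is a placeholder, and `--top-k 40` merely stands in for the optional `[extra-main-args]`):

```sh
# Build once, then translate a sentence; anything after the text is passed through to jarvis-cli.
make -j
./examples/base-translate.sh ./models/base-model.gguf "Hello, how are you today?" --top-k 40
```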

View file

@ -1,5 +1,5 @@
set(TARGET llama-batched-bench) set(TARGET jarvis-batched-bench)
add_executable(${TARGET} batched-bench.cpp) add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,6 +1,6 @@
# llama.cpp/example/batched-bench # jarvis.cpp/example/batched-bench
Benchmark the batched decoding performance of `llama.cpp` Benchmark the batched decoding performance of `jarvis.cpp`
## Usage ## Usage
@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`) - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash ```bash
./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] ./jarvis-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 ./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps ./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
# custom set of batches # custom set of batches
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 ./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
``` ```
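As a rough sizing illustration of the `N_KV = PP + B*TG` relation quoted above for the shared-prompt mode, with hypothetical numbers not taken from the diff:

```sh
# Shared prompt (-pps): PP=512, TG=128, 8 parallel sequences
# -> N_KV = 512 + 8*128 = 1536, so a 2048-token context (-c 2048) suffices.
./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 99 -pps -npp 512 -ntg 128 -npl 8
```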
## Sample results ## Sample results

View file

@ -1,7 +1,7 @@
#include "arg.h" #include "arg.h"
#include "common.h" #include "common.h"
#include "log.h" #include "log.h"
#include "llama.h" #include "jarvis.h"
#include <algorithm> #include <algorithm>
#include <cstdio> #include <cstdio>
@ -17,7 +17,7 @@ static void print_usage(int, char ** argv) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
common_params params; common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_BENCH, print_usage)) {
return 1; return 1;
} }
@ -31,42 +31,42 @@ int main(int argc, char ** argv) {
// init LLM // init LLM
llama_backend_init(); jarvis_backend_init();
llama_numa_init(params.numa); jarvis_numa_init(params.numa);
// initialize the model // initialize the model
llama_model_params model_params = common_model_params_to_llama(params); jarvis_model_params model_params = common_model_params_to_jarvis(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); jarvis_model * model = jarvis_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__); fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1; return 1;
} }
llama_context_params ctx_params = common_context_params_to_llama(params); jarvis_context_params ctx_params = common_context_params_to_jarvis(params);
// ensure enough sequences are available // ensure enough sequences are available
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end()); ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
llama_context * ctx = llama_new_context_with_model(model, ctx_params); jarvis_context * ctx = jarvis_new_context_with_model(model, ctx_params);
if (ctx == NULL) { if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); fprintf(stderr , "%s: error: failed to create the jarvis_context\n" , __func__);
return 1; return 1;
} }
const int32_t n_kv_max = llama_n_ctx(ctx); const int32_t n_kv_max = jarvis_n_ctx(ctx);
llama_batch batch = llama_batch_init(n_kv_max, 0, 1); jarvis_batch batch = jarvis_batch_init(n_kv_max, 0, 1);
// decode in batches of ctx_params.n_batch tokens // decode in batches of ctx_params.n_batch tokens
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) { auto decode_helper = [](jarvis_context * ctx, jarvis_batch & batch, int32_t n_batch) {
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = { jarvis_batch batch_view = {
n_tokens, n_tokens,
batch.token + i, batch.token + i,
nullptr, nullptr,
@ -76,13 +76,13 @@ int main(int argc, char ** argv) {
batch.logits + i, batch.logits + i,
}; };
const int ret = llama_decode(ctx, batch_view); const int ret = jarvis_decode(ctx, batch_view);
if (ret != 0) { if (ret != 0) {
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false; return false;
} }
llama_synchronize(ctx); jarvis_synchronize(ctx);
} }
return true; return true;
@ -95,7 +95,7 @@ int main(int argc, char ** argv) {
} }
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: jarvis_decode() failed\n", __func__);
return 1; return 1;
} }
} }
@ -132,16 +132,16 @@ int main(int argc, char ** argv) {
const auto t_pp_start = ggml_time_us(); const auto t_pp_start = ggml_time_us();
llama_kv_cache_clear(ctx); jarvis_kv_cache_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: jarvis_decode() failed\n", __func__);
return 1; return 1;
} }
if (is_pp_shared) { if (is_pp_shared) {
for (int32_t i = 1; i < pl; ++i) { for (int32_t i = 1; i < pl; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); jarvis_kv_cache_seq_cp(ctx, 0, i, -1, -1);
} }
} }
@ -157,7 +157,7 @@ int main(int argc, char ** argv) {
} }
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: jarvis_decode() failed\n", __func__);
return 1; return 1;
} }
} }
@ -189,14 +189,14 @@ int main(int argc, char ** argv) {
} }
LOG("\n"); LOG("\n");
llama_perf_context_print(ctx); jarvis_perf_context_print(ctx);
llama_batch_free(batch); jarvis_batch_free(batch);
llama_free(ctx); jarvis_free(ctx);
llama_free_model(model); jarvis_free_model(model);
llama_backend_free(); jarvis_backend_free();
LOG("\n\n"); LOG("\n\n");

View file

@ -1,6 +1,6 @@
.PHONY: build .PHONY: build
build: build:
xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build xcodebuild -scheme jarvis-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./llama-batched-swift rm -f ./jarvis-batched-swift
ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift ln -s ./build/Build/Products/Debug/jarvis-batched-swift ./jarvis-batched-swift

View file

@ -4,17 +4,17 @@
import PackageDescription import PackageDescription
let package = Package( let package = Package(
name: "llama-batched-swift", name: "jarvis-batched-swift",
platforms: [.macOS(.v12)], platforms: [.macOS(.v12)],
dependencies: [ dependencies: [
.package(name: "llama", path: "../../"), .package(name: "jarvis", path: "../../"),
], ],
targets: [ targets: [
// Targets are the basic building blocks of a package, defining a module or a test suite. // Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies. // Targets can depend on other targets in this package and products from dependencies.
.executableTarget( .executableTarget(
name: "llama-batched-swift", name: "jarvis-batched-swift",
dependencies: ["llama"], dependencies: ["jarvis"],
path: "Sources", path: "Sources",
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")] linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
), ),
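Since this is a standard SwiftPM manifest, the target can presumably also be built without the Makefile's xcodebuild route. A hedged sketch; the example's directory name and the release configuration are assumptions, not shown in the diff:

```sh
# Build the batched Swift example with SwiftPM instead of xcodebuild.
cd examples/batched.swift   # directory name is an assumption
swift build -c release
```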

Some files were not shown because too many files have changed in this diff.